diff --git a/README.md b/README.md index 98e6dca..e5a6c4a 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ And follow one of our tutorials [here](docs/tutorials). This setup assumes that you have built LLVM and MLIR in `$BUILD_DIR` and installed it to `$PREFIX`. The current version of this project was tested with `llvm-project` commit: -`339a7687e1c036a5f91c9d5391523b93e2e76cd3`. +`08d094a0e457360ad8b94b017d2dc277e697ca76`. Make sure you have the correct commit checked-out. **Note**: Make sure to pass `-DLLVM_INSTALL_UTILS=ON` when building LLVM/MLIR diff --git a/include/soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h b/include/soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h new file mode 100644 index 0000000..d909547 --- /dev/null +++ b/include/soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h @@ -0,0 +1,58 @@ +//===- AccelToAXI4MLIR.h - Convert Accel to AXI4MLIR calls ----*- C++ -*-===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_CONVERSION_ACCELTORUNTIME_ACCELTOAXI4MLIR_H_ +#define SODA_CONVERSION_ACCELTORUNTIME_ACCELTOAXI4MLIR_H_ + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class Pass; +class RewritePatternSet; +class ModuleOp; +template +class OperationPass; + +struct AccelToAXI4MLIROptions { + /// Accelerator Tile Size information + unsigned tileSize = 1; + + /// DMA Information + unsigned dmaAddress = 0; + unsigned dmaInputAddress = 0; + unsigned dmaInputBufferSize = 100000; + unsigned dmaOutputAddress = 100000; + unsigned dmaOutputBufferSize = 100000; + + /// Flow information + bool flowCpuAcc = false; + unsigned numberOfCaches = false; + ArrayRef cacheSizes; + ArrayRef tileSizes; + unsigned elementSize = false; +}; + +/// Populate the given list with patterns that convert from Accel to AXI4MLIR +/// runtime calls. +void populateAccelToAXI4MLIRConversionPatterns(RewritePatternSet &patterns); + +/// Populate the given list with patterns that convert from Accel to AXI4MLIR +/// runtime calls. +void populateAccelToAXI4MLIRConversionPatternsWithOptions( + RewritePatternSet &patterns, + const AccelToAXI4MLIROptions &options = AccelToAXI4MLIROptions()); + +/// Create the pass to convert accel operations to axi4mlir calls +std::unique_ptr> createConvertAccelToAXI4MLIRPass(); + +std::unique_ptr> +createConvertAccelToAXI4MLIRPass(const AccelToAXI4MLIROptions &options); + +} // namespace mlir + +#endif // SODA_CONVERSION_ACCELTORUNTIME_ACCELTOAXI4MLIR_H_ diff --git a/include/soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h b/include/soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h new file mode 100644 index 0000000..3c95a60 --- /dev/null +++ b/include/soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h @@ -0,0 +1,73 @@ +//===- Utils.h - Function and method used by axi4mlir passes ----*- C++ -*-===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_CONVERSION_LINALGTOAXI4MLIR_UTILS_H_ +#define SODA_CONVERSION_LINALGTOAXI4MLIR_UTILS_H_ + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class Pass; +class RewritePatternSet; +class PatternRewriter; +class ModuleOp; +namespace func { +class FuncOp; +} // namespace func + +struct AccelTransformationOptions { + /// Accelerator Tile Size information + unsigned accelSize = 1; + ArrayRef accelSizes; + + /// DMA Information + unsigned dmaAddress = 0; + unsigned dmaInputAddress = 0; + unsigned dmaInputBufferSize = 100000; + unsigned dmaOutputAddress = 100000; + unsigned dmaOutputBufferSize = 100000; + + /// Flow information + + /// IDs of opcodes that should be accumulated on the CPU + ArrayRef accOnCpu; + bool flowCpuAcc = false; + unsigned numberOfCaches = false; + ArrayRef cacheSizes; + ArrayRef tileSizes; + unsigned elementSize = false; + ArrayRef loopPermutation; + + /// Anchor + std::string anchorFuncName; + std::string anchorOpName; + std::string anchorFilterName; + + /// Opcode information + std::string opcodeMap; + std::string initFlow; + std::string opcodeFlow; + +public: + /// Utility to print members of the struct + void dump() const; +}; + +/// Apply tiling patterns to matmul operations with the correct attribute +void applyPatterns(func::FuncOp funcOp, const AccelTransformationOptions &options); + +/// Populates patterns that implement a FSM of modifications. +/// Changhing the kLinalgTransformMarker +/// GENERALIZE -> INTERCHANGE -> MEM(TILE) L3(TILE) -> L2(TILE) -> L1(TILE) -> +/// ACCEL +void populateCommonLinalgTransformationPatterns( + RewritePatternSet &patterns, const AccelTransformationOptions &options); + +} // namespace mlir + +#endif // SODA_CONVERSION_LINALGTOAXI4MLIR_UTILS_H_ diff --git a/include/soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h b/include/soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h new file mode 100644 index 0000000..c922696 --- /dev/null +++ b/include/soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h @@ -0,0 +1,39 @@ +//===- LinalgGenericToAccel.h - Convert linalg to AXI4MLIR calls ----*- C++ +//-*-===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_CONVERSION_LINALGTOACCEL_LINALGGENERICTOACCEL_H_ +#define SODA_CONVERSION_LINALGTOACCEL_LINALGGENERICTOACCEL_H_ + +#include "soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class Pass; +class RewritePatternSet; +class ModuleOp; +template +class OperationPass; + +/// Populate the list with patterns that convert from LinalgOps to AccelOps +void populateLinalgGenericToAccelConversionPatternsWithOptions( + RewritePatternSet &patterns, + const AccelTransformationOptions &options = AccelTransformationOptions()); +void populateLinalgGenericToAccelConversionPatterns( + RewritePatternSet &patterns); + +/// Create the pass to convert from LinalgOps to AccelOps +std::unique_ptr> +createConvertLinalgGenericToAccelPass(); + +std::unique_ptr> createConvertLinalgGenericToAccelPass( + const AccelTransformationOptions &options); + +} // namespace mlir + +#endif // SODA_CONVERSION_LINALGTOACCEL_LINALGGENERICTOACCEL_H_ diff --git a/include/soda/Conversion/Passes.h b/include/soda/Conversion/Passes.h index 2d3ba49..519eece 100644 --- a/include/soda/Conversion/Passes.h +++ b/include/soda/Conversion/Passes.h @@ -15,6 +15,8 @@ #include "soda/Conversion/KernelsToSODA/OperationToSODAPass.h" #include "soda/Conversion/KernelsToSODA/SCFToSODAPass.h" +#include "soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h" + namespace mlir { namespace soda { diff --git a/include/soda/Conversion/Passes.td b/include/soda/Conversion/Passes.td index cb6a509..be70695 100644 --- a/include/soda/Conversion/Passes.td +++ b/include/soda/Conversion/Passes.td @@ -81,4 +81,19 @@ def ConvertAllToSODA : Pass<"convert-all-to-soda", "func::FuncOp"> { ]; } +//===----------------------------------------------------------------------===// +// AccelToAXI4MLIR +//===----------------------------------------------------------------------===// + +def ConvertAccelToAXI4MLIR : Pass<"test-accel-to-axi4mlir", "ModuleOp"> { + let summary = "Convert accel ops into AXI4MLIR runtime calls"; + let constructor = "mlir::createConvertAccelToAXI4MLIRPass()"; + let dependentDialects = [ + "AffineDialect", + "memref::MemRefDialect", + "scf::SCFDialect", + "LLVM::LLVMDialect", + ]; +} + #endif // SODA_CONVERSION_PASSES diff --git a/include/soda/Dialect/Accel/CMakeLists.txt b/include/soda/Dialect/Accel/CMakeLists.txt new file mode 100644 index 0000000..f33061b --- /dev/null +++ b/include/soda/Dialect/Accel/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/include/soda/Dialect/Accel/IR/Accel.h b/include/soda/Dialect/Accel/IR/Accel.h new file mode 100644 index 0000000..8ff7525 --- /dev/null +++ b/include/soda/Dialect/Accel/IR/Accel.h @@ -0,0 +1,32 @@ +//===- Accel.h - Accel dialect ------------------------------------*- C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_DIALECT_ACCEL_IR_ACCEL_H_ +#define SODA_DIALECT_ACCEL_IR_ACCEL_H_ + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/VectorInterfaces.h" + +//===----------------------------------------------------------------------===// +// Accel Dialect +//===----------------------------------------------------------------------===// + +#include "soda/Dialect/Accel/IR/AccelOpsDialect.h.inc" + +//===----------------------------------------------------------------------===// +// Accel Dialect Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "soda/Dialect/Accel/IR/AccelOps.h.inc" + +#endif // SODA_DIALECT_ACCEL_IR_ACCEL_H_ diff --git a/include/soda/Dialect/Accel/IR/AccelBase.td b/include/soda/Dialect/Accel/IR/AccelBase.td new file mode 100644 index 0000000..a6ed582 --- /dev/null +++ b/include/soda/Dialect/Accel/IR/AccelBase.td @@ -0,0 +1,20 @@ +//===- AccelBase.td - Base definitions for accel dialect ----*- tablegen -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef ACCEL_BASE +#define ACCEL_BASE +include "mlir/IR/OpBase.td" +def Accel_Dialect : Dialect { + let name = "accel"; + let cppNamespace = "::mlir::accel"; + let description = [{ + The accel dialect is intended to hold accel operations that abstract + AXI4MLIR DMA communciations. + }]; + let useFoldAPI = kEmitFoldAdaptorFolder; +} +#endif // ACCEL_BASE diff --git a/include/soda/Dialect/Accel/IR/AccelOps.td b/include/soda/Dialect/Accel/IR/AccelOps.td new file mode 100644 index 0000000..58c5f8d --- /dev/null +++ b/include/soda/Dialect/Accel/IR/AccelOps.td @@ -0,0 +1,109 @@ +//===- AccelOps.td - Accel op definitions ------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef ACCEL_OPS +#define ACCEL_OPS + +include "soda/Dialect/Accel/IR/AccelBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for accel dialect ops. +class Accel_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// dmaInitOp +//===----------------------------------------------------------------------===// + +def Accel_InitDMAOp : Accel_Op<"init_dma"> { + let summary = "initializes the DMA"; + let description = [{ + During lowering to AXI4MLIR calls, this op is lowered to a call to + `dma_init` at the beginning of the operation scope/basic block, and + a call to `dma_free` at the end of the operation scope/basic block. 
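+
+    Example (illustrative; the SSA operand names are placeholders, following
+    the assembly format below):
+
+    ```mlir
+    accel.init_dma %dmaAddress, %dmaInputAddress, %dmaInputBufferSize,
+                   %dmaOutputAddress, %dmaOutputBufferSize
+        : (i32, i32, i32, i32, i32)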
+ ``` + }]; + let arguments = (ins SignlessIntegerLike:$dmaAddress, + SignlessIntegerLike:$dmaInputAddress, + SignlessIntegerLike:$dmaInputBufferSize, + SignlessIntegerLike:$dmaOutputAddress, + SignlessIntegerLike:$dmaOutputBufferSize); + + + // let results = (outs SignlessIntegerLike:$result); + + let assemblyFormat = [{ + $dmaAddress `,` + $dmaInputAddress `,` + $dmaInputBufferSize `,` + $dmaOutputAddress `,` + $dmaOutputBufferSize + attr-dict `:` + `(` + type($dmaAddress) `,` + type($dmaInputAddress) `,` + type($dmaInputBufferSize) `,` + type($dmaOutputAddress) `,` + type($dmaOutputBufferSize) + `)` + }]; +} + +def Accel_SendOp : Accel_Op<"send"> { + let summary = "send MemRef to DMA region"; + let description = [{ + TODO + }]; + let arguments = (ins AnyMemRef:$input, + Optional:$offset_value); + + + let results = (outs I32:$out_offset); + + let assemblyFormat = [{ + $input (`,` $offset_value^)? attr-dict `:` + `(` type($input) (`,` type($offset_value)^)? `)` `->` type($out_offset) + }]; +} + +def Accel_SendLiteralOp : Accel_Op<"sendLiteral"> { + let summary = "send Literal to DMA region"; + let description = [{ + Used to send a literal value to the DMA region. + The literal value is considered an opcode. + }]; + let arguments = (ins SignlessIntegerLike:$opcode, + Optional:$offset_value); + + + let results = (outs I32:$out_offset); + + let assemblyFormat = [{ + $opcode (`,` $offset_value^)? attr-dict `:` + `(` type($opcode) (`,` type($offset_value)^)? `)` `->` type($out_offset) + }]; +} + +def Accel_RecvOp : Accel_Op<"recv"> { + let summary = "receive data from the DMA region into the MemRef"; + let description = [{ + TODO + }]; + let arguments = (ins AnyMemRef:$dst, + Optional:$offset_value); + + + let results = (outs I32:$out_offset); + + let assemblyFormat = [{ + $dst (`,` $offset_value^)? attr-dict `:` + `(` type($dst) (`,` type($offset_value)^)? 
`)` `->` type($out_offset) + }]; +} + +#endif // ACCEL_OPS diff --git a/include/soda/Dialect/Accel/IR/CMakeLists.txt b/include/soda/Dialect/Accel/IR/CMakeLists.txt new file mode 100644 index 0000000..975d364 --- /dev/null +++ b/include/soda/Dialect/Accel/IR/CMakeLists.txt @@ -0,0 +1,2 @@ +add_mlir_dialect(AccelOps accel) +add_mlir_doc(AccelOps AccelOps Dialects/ -gen-dialect-doc) diff --git a/include/soda/Dialect/CMakeLists.txt b/include/soda/Dialect/CMakeLists.txt index 8973d6a..2933ef4 100644 --- a/include/soda/Dialect/CMakeLists.txt +++ b/include/soda/Dialect/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(SODA) add_subdirectory(SNN) add_subdirectory(Linalg) add_subdirectory(Affine) +add_subdirectory(Accel) add_subdirectory(Transform) diff --git a/include/soda/Dialect/SNN/IR/SNNBase.td b/include/soda/Dialect/SNN/IR/SNNBase.td index 16f32be..76725ba 100644 --- a/include/soda/Dialect/SNN/IR/SNNBase.td +++ b/include/soda/Dialect/SNN/IR/SNNBase.td @@ -25,6 +25,7 @@ def SNN_Dialect : Dialect { }]; let dependentDialects = ["tensor::TensorDialect"]; + let useFoldAPI = kEmitFoldAdaptorFolder; } #endif // SNN_BASE diff --git a/include/soda/Dialect/SODA/SODABase.td b/include/soda/Dialect/SODA/SODABase.td index cd7f67c..1806945 100644 --- a/include/soda/Dialect/SODA/SODABase.td +++ b/include/soda/Dialect/SODA/SODABase.td @@ -53,6 +53,7 @@ def SODA_Dialect : Dialect { let useDefaultAttributePrinterParser = 1; let useDefaultTypePrinterParser = 1; + let useFoldAPI = kEmitFoldAdaptorFolder; } def SODA_AsyncToken : DialectType< diff --git a/include/soda/ExecutionEngine/axi/AxiUtils.h b/include/soda/ExecutionEngine/axi/AxiUtils.h new file mode 100644 index 0000000..b9d3841 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/AxiUtils.h @@ -0,0 +1,168 @@ +//===- AxUtils.h - Utils for debugging MLIR execution -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares AXI4MLIR functions to be called by the host to communicate +// with AXI enabled accelerators. 
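+// A typical host-side call sequence (illustrative, not enforced by this
+// header) is: dma_init, dma_copy_to_inbuffer, dma_start_send, dma_wait_send,
+// dma_start_recv, dma_wait_recv, dma_copy_from_outbuffer, dma_free.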
+// +//===----------------------------------------------------------------------===// + +#ifndef EXECUTIONENGINE_AXIUTILS_H_ +#define EXECUTIONENGINE_AXIUTILS_H_ + +#ifdef _WIN32 +#ifndef MLIR_AXIRUNNERUTILS_EXPORT +#ifdef mlir_runner_utils_EXPORTS +// We are building this library +#define MLIR_AXIRUNNERUTILS_EXPORT __declspec(dllexport) +#else +// We are using this library +#define MLIR_AXIRUNNERUTILS_EXPORT __declspec(dllimport) +#endif // mlir_runner_utils_EXPORTS +#endif // MLIR_AXIRUNNERUTILS_EXPORT +#else +#define MLIR_AXIRUNNERUTILS_EXPORT +#endif // _WIN32 + +#include +#include + +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +//-----------------DMA Functions----------------- +/** + * - dma_address is base address of dma + * - dma_input_addr is starting memory location for the dma input buffer, + * - dma_input_buffer_size is length of the buffer + * - dma_output_addr is starting memory location for the dma output buffer, + * - dma_output_buffer_size is length of the buffer + * + * + * Runs starting controls signals and sets MMS2, S2MM address registers to start + * memory locations of the input and output buffers + */ + +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void +dma_init(unsigned int dma_address, unsigned int dma_input_address, + unsigned int dma_input_buffer_size, unsigned int dma_output_address, + unsigned int dma_output_buffer_size); + +// Memory unmaps DMA control_register_address and Input and output buffers +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void dma_free(); + +//================================================================================================================ + +//-----------------BUFFER Functions----------------- +// Get the MMap address of the input buffer of the dma *Needed to copy data to +// Input_Buffer* +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int *dma_get_inbuffer(); + +// Get the MMap address of the output buffer of the dma *Needed to copy data +// from Output_Buffer* +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int *dma_get_outbuffer(); + +//================================================================================================================ + +//-----------------BUFFER Functions----------------- +// Copy data into the Input Buffer (length to write, offset to write to) returns +// 0 if successful +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int +dma_copy_to_inbuffer(unsigned int *host_src_address, int data_length, + int offset); + +// Copy data from the Output Buffer (length to read, offset to read from) +// returns 0 if successful +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int +dma_copy_from_outbuffer(unsigned int *host_dst_address, int data_length, + int offset); + +//-----------------BUFFER Functions----------------- +// Copy data into the Input Buffer (length to write, offset to write to) returns +// 0 if successful +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset); + +// Copy data from the Output Buffer (length to read, offset to read from) +// returns 0 if successful +template +int mlir_dma_copy_from_outbuffer(const DynamicMemRefType &dst, + int data_length, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_to_inbuffer_i32(int64_t 
rank, void *ptr, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset); + +//================================================================================================================ + +//-----------------DMA MMS2 Functions----------------- +/** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_start_send(int length, + int offset); + +// Same as dma_send but thread does not block, returns if 0 +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_check_send(); + +// Blocks thread until dma MMS2 transfer is complete +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void dma_wait_send(); + +//-----------------DMA S2MM Functions----------------- +/** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_start_recv(int length, + int offset); + +// Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void dma_wait_recv(); + +// Same as dma_recv but thread does not block, returns if 0 +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_check_recv(); + +// Unexposed to MLIR +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int +dma_set(unsigned int *dma_virtual_address, int offset, unsigned int value); + +// Unexposed to MLIR +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int +dma_get(unsigned int *dma_virtual_address, int offset); + +//-----------------Util Functions----------------- + +// Converts memref into llvm_array pointers +// extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int * +// memref_to_ptr(UnrankedMemRefType * in_memref) { +// return in_memref->descriptor; +// } + +// // Converts pointers into memrefs +// extern "C" MLIR_AXIRUNNERUTILS_EXPORT UnrankedMemRefType +// ptr_to_memref(unsigned int *bare_ptr) { + +// UnrankedMemRefType my_memref; +// return my_memref; +// } + +#endif // EXECUTIONENGINE_AXIUTILS_H_ diff --git a/include/soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h new file mode 100755 index 0000000..63d9350 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h @@ -0,0 +1,94 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v1 + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + + + void Recv(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + 
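+  // Bind the shared clock, reset, and FIFO channels to both the accelerator
+  // and the DMA driver so the two modules communicate over the same streams.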
acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + bool tlast = false; + int output = 0; + while(!tlast){ + DATA inp = din1.read(); + DATA wgt = din1.read(); + output+= inp.data*wgt.data; + + // cout << inp.data << "*" << wgt.data << endl; + tlast = (inp.tlast || wgt.tlast); + DWAIT(); + } + DATA d; + d.tlast = true; + d.data = output; + dout1.write(d); + DWAIT(); + } +} +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/conv_v1/conv_v1.json b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/conv_v1.json new file mode 100755 index 0000000..e997d57 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/conv_v1.json @@ -0,0 +1,120 @@ +{ + "name": "MM_4x4_v1", + "version": "1.0", + "description": "MM Accelerator", + + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ] + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + + "ISA": { + "instruction_format": { + "opcode_length": 0, + "op_args": 0 + }, + "opcodes": { + "-": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/dma_engine.sc.h b/include/soda/ExecutionEngine/axi/accelerators/dma_engine.sc.h new file mode 100644 index 0000000..47c229a --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/dma_engine.sc.h @@ -0,0 +1,91 @@ +#ifndef DMA_DRIVER_H +#define DMA_DRIVER_H + +#include + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +typedef struct _DATA { + sc_uint<32> data; + bool tlast; + inline friend ostream &operator<<(ostream &os, const _DATA &v) { + cout << "data: " << v.data << " tlast: " << v.tlast; + return os; + 
} +} DATA; + +SC_MODULE(DMA_DRIVER) { + sc_in clock; + sc_in reset; + sc_fifo_in dout1; + sc_fifo_out din1; + bool send; + bool recv; + + void DMA_MMS2() { + while (1) { + while (!send) + wait(); + for (int i = 0; i < input_len; i++) { + int d = DMA_input_buffer[i + input_offset]; + bool tlast = (i+1 == input_len); + din1.write({d, tlast}); + wait(); + } + send = false; + wait(); + sc_pause(); + wait(); + } + }; + + void DMA_S2MM() { + while (1) { + while (!recv) + wait(); + bool last = false; + int i = 0; + do { + DATA d = dout1.read(); + while (i >= output_len) + wait(); + last = d.tlast; + DMA_output_buffer[output_offset + i++] = d.data; + wait(); + } while (!last); + output_len = i; + recv = false; + // To ensure wait_send() does not evoke the sc_pause + while (send) + wait(2); + sc_pause(); + wait(); + } + }; + + SC_HAS_PROCESS(DMA_DRIVER); + + DMA_DRIVER(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(DMA_MMS2, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(DMA_S2MM, clock.pos()); + reset_signal_is(reset, true); + } + + int *DMA_input_buffer; + int *DMA_output_buffer; + + // TODO: input_length = Number of elements * (sizeof(elements)/32) + int input_len; + int input_offset; + + int output_len; + int output_offset; +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/dma_engine_v2.sc.h b/include/soda/ExecutionEngine/axi/accelerators/dma_engine_v2.sc.h new file mode 100644 index 0000000..3bed052 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/dma_engine_v2.sc.h @@ -0,0 +1,116 @@ +#ifndef DMA_DRIVER_H +#define DMA_DRIVER_H + +#include + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +typedef struct _DATA { + sc_uint<32> data; + bool tlast; + inline friend ostream &operator<<(ostream &os, const _DATA &v) { + cout << "data: " << v.data << " tlast: " << v.tlast; + return os; + } +} DATA; + +SC_MODULE(DMA_DRIVER) { + sc_in clock; + sc_in reset; + sc_fifo_in dout1; + sc_fifo_out din1; + bool send; + bool recv; + + void DMA_MMS2() { + while (1) { + + while (!send) + wait(); + + int send_len = input_len * isize; + for (int i = 0; i < send_len; i++) { + sc_uint<32> d; + d.range(7, 0) = DMA_input_buffer[(input_offset * isize) + i++]; + if (isize > 1 && i < send_len) + d.range(15, 8) = DMA_input_buffer[(input_offset * isize) + i++]; + if (isize > 2 && i < send_len) + d.range(23, 16) = DMA_input_buffer[(input_offset * isize) + i++]; + if (isize > 3 && i < send_len) + d.range(31, 24) = DMA_input_buffer[(input_offset * isize) + i++]; + wait(); + din1.write({d, 1}); + wait(); + } + + send = false; + wait(); + sc_pause(); + wait(); + } + }; + + void DMA_S2MM() { + while (1) { + while (!recv) + wait(); + bool last = false; + int i = 0; + + do { + DATA d = dout1.read(); + int recv_len = output_len * osize; + while (i >= recv_len) + wait(); + last = d.tlast; + DMA_output_buffer[(output_offset * osize) + i++] = d.data.range(7, 0); + if (osize > 1) + DMA_output_buffer[(output_offset * osize) + i++] = + d.data.range(15, 8); + if (osize > 2) + DMA_output_buffer[(output_offset * osize) + i++] = + d.data.range(23, 16); + if (osize > 3) + DMA_output_buffer[(output_offset * osize) + i++] = + d.data.range(31, 24); + wait(); + } while (!last); + + recv_len = i; + recv = false; + // To ensure wait_send() does not evoke the sc_pause + while (send) + wait(2); + sc_pause(); + wait(); + } + }; + + SC_HAS_PROCESS(DMA_DRIVER); + + DMA_DRIVER(sc_module_name name_) : sc_module(name_) { + 
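+    // Register the send (DMA_MMS2) and receive (DMA_S2MM) loops as clocked
+    // SystemC threads; both are held in reset while `reset` is high.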
SC_CTHREAD(DMA_MMS2, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(DMA_S2MM, clock.pos()); + reset_signal_is(reset, true); + } + + char *DMA_input_buffer; + char *DMA_output_buffer; + + // length = Number of elements + unsigned int input_len; + unsigned int input_offset; + unsigned int isize; + + unsigned int output_len; + unsigned int output_offset; + unsigned int osize; +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h new file mode 100644 index 0000000..d62bd5b --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h @@ -0,0 +1,238 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" + + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +#define ACCNAME MM_4x4v1 + +#define M 4 +#define N 4 +#define K 4 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[M][K]; + sc_int<32> weights[K][N]; + sc_int<32> outputs[M][N]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose = true; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++" ); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++" ); + ALOG("Executed with :" << __FILE__ ); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int k = 0; k < K; k++) { + inputs[m][k] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + for (int k = 0; k < K; k++) { 
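+      // Stream in the 4x4 weight tile (B); one element is consumed per FIFO read.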
+ // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + weights[k][n] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) + cout << inputs[m][k] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) + cout << weights[k][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(true); + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + int acc = 0; + for (int k = 0; k < K; k++) { + int x = inputs[m][k]; + int y = weights[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + outputs[m][n] = acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "Output: " << process_blocks - 1 << endl; + cout << "=========================" << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) + cout << outputs[m][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + DATA d; + d.tlast = false; + if (m == M - 1 && n == N - 1) + d.tlast = true; + d.data = outputs[m][n]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/mm_4x4_v1.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/mm_4x4_v1.json new file mode 100644 index 0000000..e997d57 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/mm_4x4_v1.json @@ -0,0 +1,120 @@ +{ + "name": "MM_4x4_v1", + "version": "1.0", + "description": "MM Accelerator", + + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ] + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + + "ISA": { + "instruction_format": { + "opcode_length": 0, + "op_args": 0 + }, + "opcodes": { + "-": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + 
"length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h new file mode 100644 index 0000000..5570bf6 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h @@ -0,0 +1,270 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +#define M 4 +#define N 4 +#define K 4 + +#define ACCNAME MM_4x4v2 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 000 : 0 = NOP; +// 001 : 1 = read_A; +// 010 : 2 = read_B; +// 011 : 3 = read_A -> read_B; +// 100 : 4 = compute_C; +// 101 : 5 = read_A -> compute_C; +// 110 : 6 = read_B -> compute_C; +// 111 : 7 = read_A -> read_B -> compute_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + + opcode(sc_uint<32> _packet) { + // ALOG("OPCODE: " << _packet); + // ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[M][K]; + sc_int<32> weights[K][N]; + sc_int<32> outputs[M][N]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + + // clang-format off + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} + // #pragma HLS RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} + // #pragma HLS RESET variable=reset + + // #pragma HLS array_partition variable=inputs complete dim=2 + // #pragma HLS array_partition variable=weights complete dim=0 + // #pragma HLS array_partition variable=outputs complete dim=2 + // clang-format on + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + 
+void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Executed with :" << __FILE__); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int k = 0; k < K; k++) { + inputs[m][k] = din1.read().data; + read_A_len++; + } + } + if (verbose) { + cout << "=========================" << endl; + cout << "Read BLOCK A: " << read_A_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) + cout << inputs[m][k] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + if (packet.read_B) { + for (int k = 0; k < K; k++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + weights[k][n] = din1.read().data; + read_B_len++; + } + } + if (verbose) { + cout << "=========================" << endl; + cout << "Read BLOCK B: " << read_B_len++ << endl; + cout << "=========================" << endl; + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) + cout << weights[k][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + if (packet.compute_C) { + wait(); + compute.write(true); + } + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + int acc = 0; + for (int k = 0; k < K; k++) { + int x = inputs[m][k]; + int y = weights[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + outputs[m][n] = acc; + } + } + + if (verbose) { + cout << "=========================" << endl; + cout << "Compute BLOCK C: " << compute_C_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) + cout << outputs[m][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + DATA d; + d.tlast = false; + if (m == M - 1 && n == N - 1) + d.tlast = true; + d.data = outputs[m][n]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/mm_4x4_v2.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/mm_4x4_v2.json new file mode 100644 index 0000000..e469ab4 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/mm_4x4_v2.json @@ -0,0 +1,218 @@ +{ + "name": "MM_4x4_v2", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": 
false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h new file mode 100644 index 0000000..3e52749 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h @@ -0,0 +1,299 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +#define M 4 +#define N 4 +#define K 4 + +#define ACCNAME MM_4x4v3 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + 
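+  // Stage-enable flags decoded from the low four bits of the 32-bit opcode
+  // word: bit 0 = read_A, bit 1 = read_B, bit 2 = compute_C, bit 3 = send_C.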
bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + // ALOG("OPCODE: " << _packet); + // ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[M][K]; + sc_int<32> weights[K][N]; + sc_int<32> outputs[M][N]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + + // clang-format off + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} + // #pragma HLS RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} + // #pragma HLS RESET variable=reset + + // #pragma HLS array_partition variable=inputs complete dim=2 + // #pragma HLS array_partition variable=weights complete dim=0 + // #pragma HLS array_partition variable=outputs complete dim=2 + // clang-format on + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Executed with :" << __FILE__); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int k = 0; k < K; k++) { + inputs[m][k] = din1.read().data; + read_A_len++; + } + } + if (verbose) { + cout << "=========================" << endl; + cout << "Read BLOCK A: " << read_A_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) + cout << inputs[m][k] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + if (packet.read_B) { + for (int k = 0; k < K; k++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + weights[k][n] = din1.read().data; + read_B_len++; + } + } + if (verbose) { + cout << 
"=========================" << endl; + cout << "Read BLOCK B: " << read_B_len++ << endl; + cout << "=========================" << endl; + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) + cout << weights[k][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + // Computes C if true + if (packet.compute_C) { + wait(); + compute.write(true); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + wait(); + send.write(true); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + int acc = 0; + for (int k = 0; k < K; k++) { + int x = inputs[m][k]; + int y = weights[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + outputs[m][n] += acc; + } + } + wait(); + compute.write(false); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + DATA d; + d.tlast = false; + if (m == M - 1 && n == N - 1) + d.tlast = true; + d.data = outputs[m][n]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + } + + if (verbose) { + cout << "=========================" << endl; + cout << "Compute BLOCK C: " << compute_C_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) + cout << outputs[m][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + + for (int m = 0; m < M; m++) { + // #pragma HLS unroll + for (int n = 0; n < N; n++) { + // #pragma HLS unroll + outputs[m][n] = 0; // Clears after sends + } + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/mm_4x4_v3.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/mm_4x4_v3.json new file mode 100644 index 0000000..83c0a89 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/mm_4x4_v3.json @@ -0,0 +1,352 @@ +{ + "name": "MM_4x4_v3", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": 
[ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + + + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h new file mode 100644 index 0000000..c661acf --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h @@ -0,0 +1,344 @@ +#ifndef ACC_H +#define ACC_H + +#define PE_M 16 +#define PE_N 16 +#define PE_K 16 + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v4 + +// #define VERBOSE_ACC +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 
0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +#define su10 sc_uint<10> +#define su12 sc_uint<12> +// MAX M, N, K = 256 +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + // ALOG("OPCODE: " << _packet); + // ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +struct code_extension { + su10 N; + su10 M; + su10 K; + su10 K16; + su10 N16; + + code_extension(sc_uint<32> _packetA) { + M = _packetA.range(9, 0); + N = _packetA.range(19, 10); + K = _packetA.range(29, 20); + N16 = _packetA.range(19, 10) / PE_N; + K16 = _packetA.range(29, 20) / PE_K; + // ALOG("packetA: " << _packetA); + // ALOG("Time: " << sc_time_stamp()); + // ALOG("N: " << N << ", M: " << M << ", K: " << K); + // cin.ignore(); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A_buffer[256][16]; + sc_int<32> B_buffer[256][16]; + sc_int<32> C_buffer[256][16]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + code_extension acc_args = code_extension(0); + + void Recv(); + + void Compute(sc_int<32>[PE_M][PE_K], sc_int<32>[PE_K][PE_N], + sc_int<32>[PE_M][PE_N]); + + void LoadA(sc_int<32>[PE_M][PE_K], su10, su10, su10); + + void LoadB(sc_int<32>[PE_K][PE_N], su10, su10, su10); + + void Store(sc_int<32>[PE_M][PE_N], su10, su10, su10); + + void Schedule_Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Schedule_Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Executed with :" 
<< __FILE__); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + + wait(); + while (1) { + opcode packet(din1.read().data); + code_extension op_args(din1.read().data); + acc_args = op_args; + + if (packet.read_A) { + int read_length = op_args.M * op_args.K16; + for (int i = 0; i < read_length; i++) { + for (int j = 0; j < 16; j++) { + A_buffer[i][j] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + } + + if (packet.read_B) { + int read_length = op_args.K * op_args.N16; + for (int i = 0; i < read_length; i++) { + for (int j = 0; j < 16; j++) { + B_buffer[i][j] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + } + + // Computes C if true + if (packet.compute_C) { + compute.write(true); + wait(); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + send.write(true); + wait(); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::LoadA(sc_int<32> A[PE_M][PE_K], su10 M, su10 K, su10 in_stride) { + su12 base = M * in_stride + K; + su12 offset = 0; + for (su10 m = 0; m < PE_M; m++) { + for (su10 k = 0; k < PE_K; k++) { + // #pragma HLS unroll + A[m][k] = A_buffer[base + offset][k]; + } + offset += in_stride; + } +} + +void ACCNAME::LoadB(sc_int<32> B[PE_K][PE_N], su10 K, su10 N, su10 in_stride) { + su12 base = K * in_stride + N; + su12 offset = 0; + for (su10 k = 0; k < PE_K; k++) { + for (su10 n = 0; n < PE_N; n++) { + // #pragma HLS unroll + B[k][n] = B_buffer[base + offset][n]; + } + offset += in_stride; + } +} + +void ACCNAME::Compute(sc_int<32> A[PE_M][PE_K], sc_int<32> B[PE_K][PE_N], + sc_int<32> C[PE_M][PE_N]) { + for (int m = 0; m < PE_M; m++) { + for (int n = 0; n < PE_N; n++) { + // #pragma HLS pipeline + // #pragma HLS unroll factor 4 + int acc = 0; + for (int k = 0; k < PE_K; k++) { + int x = A[m][k]; + int y = B[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + C[m][n] = acc; + } + } +} + +void ACCNAME::Store(sc_int<32> C[PE_M][PE_N], su10 M, su10 N, su10 out_stride) { + su12 base = M * out_stride + N; + su12 offset = 0; + for (su10 m = 0; m < PE_M; m++) { + // #pragma HLS pipeline + for (su10 n = 0; n < PE_N; n++) { + // #pragma HLS unroll + C_buffer[base + offset][n] += C[m][n]; + } + offset += out_stride; + } +} + +void ACCNAME::Schedule_Compute() { + sc_int<32> A[PE_M][PE_K]; + sc_int<32> B[PE_K][PE_N]; + sc_int<32> C[PE_M][PE_N]; + // #pragma HLS array_partition variable = A complete dim = 2 + // #pragma HLS array_partition variable = B complete dim = 2 + // #pragma HLS array_partition variable = C complete dim = 2 + + wait(); + while (1) { + while (!compute) + wait(); + + unsigned int ks = 0; + for (su10 k = 0; k < acc_args.K; k += PE_K) { + for (su10 m = 0; m < acc_args.M; m += PE_M) { + LoadA(A, m, ks, acc_args.K16); + for (su10 n = 0; n < acc_args.N16; n++) { + LoadB(B, k, n, acc_args.N16); + Compute(A, B, C); + Store(C, m, n, acc_args.N16); + } + } + ks++; + } + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + unsigned int write_length = acc_args.M * acc_args.N16; + for (su10 m = 0; m < write_length; m++) { + for (su10 n = 0; n < 16; n++) { + DATA d; + d.tlast = false; + d.data = C_buffer[m][n]; + if (n + 1 == 16 && m + 1 == write_length) + d.tlast = true; + dout1.write(d); + send_C_len++; + wait(); + C_buffer[m][n] = 0; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +#endif diff --git 
a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/mm_4x4_v4.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/mm_4x4_v4.json new file mode 100644 index 0000000..1be763f --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/mm_4x4_v4.json @@ -0,0 +1,361 @@ +{ + "name": "MM_4x4_v4", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 4096, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#tile_N": 4, + "#tile_M": 4, + "#tile_K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_M", + "#tile_K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_extension": 64 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + 
"buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ] + }, + "op_arg": { + "0-15": "#N", + "16-31": "#M", + "31-63": "#K" + } + }, + "schedule": { + "allowed_patterns": [ + "R#a, R#b, C, S#c", + "a" + ] + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h new file mode 100644 index 0000000..2fcd716 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h @@ -0,0 +1,187 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v5 + +// OP-Code Stuct +// 000 : 0 = read_A -> read_B -> compute_C; +// 001 : 1 = store_C; + + +struct opcode { + unsigned int packet; + bool store_C; + + opcode(sc_uint<32> _packet) { + packet = _packet; + store_C = _packet.range(0, 0); + // ALOG("OPCODE: " << packet); + // ALOG("Time: " << sc_time_stamp()); + } +}; + + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A[16]; + sc_int<32> B[16]; + sc_int<32> C[16]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " 
<< read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + opcode packet(din1.read().data); + + if (packet.store_C) { + wait(); + send.write(true); + wait(); + }else{ + wait(); + for (int i = 0; i < 16; i++) { + A[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + for (int i = 0; i < 16; i++) { + B[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + compute.write(true); + wait(); + } + + while(send.read() || compute.read()) + wait(); + + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + + int x = A[i * 4 + d]; + int y = B[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + C[i * 4 + w] += acc; + } + } + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = C[i]; + C[i] = 0; + dout1.write(d); + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v6/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v6/accelerator.sc.h new file mode 100644 index 0000000..6bc37c8 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v6/accelerator.sc.h @@ -0,0 +1,247 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v6 + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + cout << "OPCODE: " << _packet << endl; + cout << "Time: " << sc_time_stamp() << endl; + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +struct code_extension { + sc_uint<16> N; + sc_uint<16> M; + + sc_uint<16> K; + sc_uint<16> in_stride; + + sc_uint<16> out_offset; + sc_uint<16> out_stride; + + code_extension(sc_uint<32> _packetA, sc_uint<32> _packetB, + sc_uint<32> _packetC) { + + N = _packetA.range(15,0); + M = _packetA.range(31,16); + + K = _packetB.range(15,0); + in_stride = _packetB.range(31,16); + + out_offset = _packetC.range(15,0); + out_stride = _packetC.range(31,16); + + cout << "Time: " << sc_time_stamp() << endl; + cout << "N: " << N << ", M: " << M << ", K: " << K + << ", in_stride: " << in_stride << ", out_offset: " << out_offset + << ", out_stride: 
" << out_stride << endl; + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A_buffer[4096]; + sc_int<32> B_buffer[4096]; + sc_int<32> C_buffer[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + code_extension acc_args = code_extension(0,0,0); + + void Recv(); + + void Compute(int, int, int, int, int, int); + + void Schedule_Compute(); + + void Send(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Schedule_Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + code_extension op_args(din1.read().data, din1.read().data, + din1.read().data); + acc_args = op_args; + + if (packet.read_A) { + unsigned int read_length = op_args.N * op_args.K; + for (int i = 0; i < read_length; i++) { + A_buffer[i] = din1.read().data; + DWAIT(); + } + } + + if (packet.read_B) { + unsigned int read_length = op_args.M * op_args.K; + for (int i = 0; i < read_length; i++) { + B_buffer[i] = din1.read().data; + DWAIT(); + } + } + + // Computes C if true + if (packet.compute_C) { + compute.write(true); + wait(); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + send.write(true); + wait(); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute(int N, int M, int K, int in_stride, int out_offset, + int out_stride) { + for (int n = 0; n < 4; n++) { + for (int m = 0; m < 4; m++) { + int acc = 0; + for (int k = 0; k < 4; k++) { + int a_data = A_buffer[(N + n) * in_stride + K + k]; + int b_data = B_buffer[(M + m) * in_stride + K + k]; + acc += a_data * b_data; + } + C_buffer[out_offset + (N + n) * out_stride + M + m] += acc; + } + } +} + +void ACCNAME::Schedule_Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int n = 0; n < acc_args.N; n += 4) { + for (int m = 0; m < acc_args.M; m += 4) { + for (int k = 0; k < acc_args.K; k += 4) { + Compute(n, m, k, acc_args.in_stride, acc_args.out_offset, + acc_args.out_stride); + } + } + } + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int n = 0; n < acc_args.N; n++) { + for (int m = 0; m < acc_args.M; m++) { + DATA d; + d.tlast = false; + d.data = C_buffer[acc_args.out_offset + n * acc_args.out_stride + m]; + if (n + 1 == acc_args.N && m + 1 == acc_args.M) + d.tlast = true; + dout1.write(d); + C_buffer[acc_args.out_offset + n * acc_args.out_stride + m] = 0; + } + } + send.write(false); + wait(); + } +} + + +#endif diff --git 
a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/accelerator.sc.h new file mode 100644 index 0000000..434c75b --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/accelerator.sc.h @@ -0,0 +1,200 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v1 + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[4096]; + sc_int<32> weights[4096]; + sc_int<32> outputs[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + for (int i = 0; i < 16; i++) { + inputs[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + + for (int i = 0; i < 16; i++) { + weights[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + + // DEBUG ONLY + if (true) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << inputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << weights[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(true); + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + 
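+    // The loop nest below computes one 4x4 tile product with the second
+    // operand indexed as its transpose (weights[w * 4 + d] walks row w), so
+    // outputs[i * 4 + w] = sum_d inputs[i * 4 + d] * weights[w * 4 + d].
+    // Illustrative plain-C++ reference of the same arithmetic (assumed
+    // host-side check, not part of this module):
+    //
+    //   for (int i = 0; i < 4; i++)
+    //     for (int w = 0; w < 4; w++) {
+    //       int acc = 0;
+    //       for (int d = 0; d < 4; d++)
+    //         acc += A[i * 4 + d] * Bt[w * 4 + d]; // Bt: B stored transposed
+    //       C[i * 4 + w] = acc;
+    //     }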
while (!compute) + wait(); + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + int x = inputs[i * 4 + d]; + int y = weights[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + outputs[i * 4 + w] = acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << outputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = outputs[i]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/mm_4x4_v1.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/mm_4x4_v1.json new file mode 100644 index 0000000..e997d57 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/mm_4x4_v1.json @@ -0,0 +1,120 @@ +{ + "name": "MM_4x4_v1", + "version": "1.0", + "description": "MM Accelerator", + + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ] + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + + "ISA": { + "instruction_format": { + "opcode_length": 0, + "op_args": 0 + }, + "opcodes": { + "-": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/accelerator.sc.h new file mode 100644 index 0000000..ccfb1a3 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/accelerator.sc.h @@ -0,0 +1,241 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MMT_4x4v2 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 000 : 0 = NOP; +// 001 : 1 = read_A; +// 010 : 2 = read_B; +// 011 : 3 = read_A -> read_B; +// 100 : 4 = compute_C; +// 101 : 5 = read_A -> compute_C; +// 110 : 6 = read_B -> compute_C; +// 111 : 7 = read_A -> read_B -> 
compute_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + + opcode(sc_uint<32> _packet) { + ALOG("OPCODE: " << _packet); + ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[4096]; + sc_int<32> weights[4096]; + sc_int<32> outputs[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int i = 0; i < 16; i++) { + inputs[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + if (packet.read_B) { + for (int i = 0; i < 16; i++) { + weights[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << inputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << weights[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + if (packet.compute_C) { + wait(); + compute.write(true); + } + wait(); + } 
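+  // Host-side sketch (assumed driver code, not part of this module) of how
+  // the 3-bit opcode decoded at the top of Recv() could be packed before it
+  // is written into din1:
+  //
+  //   unsigned int op = 0;
+  //   op |= 1u << 0; // read_A
+  //   op |= 1u << 1; // read_B
+  //   op |= 1u << 2; // compute_C -> op == 7: read_A -> read_B -> compute_C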
+} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + int x = inputs[i * 4 + d]; + int y = weights[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + outputs[i * 4 + w] = acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << outputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = outputs[i]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/mm_4x4_v2.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/mm_4x4_v2.json new file mode 100644 index 0000000..e469ab4 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/mm_4x4_v2.json @@ -0,0 +1,218 @@ +{ + "name": "MM_4x4_v2", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + 
"buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/accelerator.sc.h new file mode 100644 index 0000000..c9b5ebc --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/accelerator.sc.h @@ -0,0 +1,263 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v3 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + ALOG("OPCODE: " << _packet); + ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[4096]; + sc_int<32> weights[4096]; + sc_int<32> outputs[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + 
acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int i = 0; i < 16; i++) { + inputs[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + if (packet.read_B) { + for (int i = 0; i < 16; i++) { + weights[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << inputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << weights[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + // Computes C if true + if (packet.compute_C) { + wait(); + compute.write(true); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + wait(); + send.write(true); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + int x = inputs[i * 4 + d]; + int y = weights[d * 4 + w]; + // int y = weights[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + outputs[i * 4 + w] += acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << outputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = outputs[i]; + dout1.write(d); + outputs[i] = 0; // Clears after sends + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/mm_4x4_v3.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/mm_4x4_v3.json new file mode 100644 index 0000000..83c0a89 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/mm_4x4_v3.json @@ -0,0 +1,352 @@ +{ + "name": "MM_4x4_v3", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + 
"kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + 
} + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + + + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/accelerator.sc.h new file mode 100644 index 0000000..6907476 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/accelerator.sc.h @@ -0,0 +1,262 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v4 + +// #define VERBOSE_ACC +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + ALOG("OPCODE: " << _packet); + ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +struct code_extension { + sc_uint<16> N; + sc_uint<16> M; + sc_uint<32> K; + + code_extension(sc_uint<32> _packetA, sc_uint<32> _packetB) { + N = _packetA.range(15, 0); + M = _packetA.range(31, 16); + K = _packetB.range(31, 0); + + ALOG("Time: " << sc_time_stamp()); + ALOG("N: " << N << ", M: " << M << ", K: " << K); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A_buffer[4096]; + sc_int<32> B_buffer[4096]; + sc_int<32> C_buffer[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + code_extension acc_args = code_extension(0, 0); + + void Recv(); + + void Compute(int, int, int, int, int); + + void Schedule_Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Schedule_Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + 
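+  // The driver bound here feeds Recv(), which expects one opcode word
+  // followed by two extension words decoded by code_extension above
+  // (packetA[15:0] = N, packetA[31:16] = M, packetB[31:0] = K).
+  // Assumed host-side packing sketch (illustrative only):
+  //   unsigned int packetA = (M << 16) | (N & 0xFFFFu);
+  //   unsigned int packetB = K;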
dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + code_extension op_args(din1.read().data, din1.read().data); + acc_args = op_args; + + if (packet.read_A) { + unsigned int read_length = op_args.N * op_args.K; + for (int i = 0; i < read_length; i++) { + A_buffer[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + if (packet.read_B) { + unsigned int read_length = op_args.M * op_args.K; + for (int i = 0; i < read_length; i++) { + B_buffer[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // Computes C if true + if (packet.compute_C) { + compute.write(true); + wait(); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + send.write(true); + wait(); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute(int N, int M, int K, int in_stride, int out_stride) { + for (int n = 0; n < 4; n++) { + for (int m = 0; m < 4; m++) { + int acc = 0; + for (int k = 0; k < 4; k++) { + int a_data = A_buffer[(N + n) * in_stride + K + k]; + int b_data = B_buffer[(M + m) * in_stride + K + k]; + acc += a_data * b_data; + compute_C_len++; + } + C_buffer[(N + n) * out_stride + M + m] += acc; + } + } +} + +void ACCNAME::Schedule_Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int n = 0; n < acc_args.N; n += 4) { + for (int m = 0; m < acc_args.M; m += 4) { + for (int k = 0; k < acc_args.K; k += 4) { + Compute(n, m, k, acc_args.K, acc_args.M); + } + } + } + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int n = 0; n < acc_args.N; n++) { + for (int m = 0; m < acc_args.M; m++) { + DATA d; + d.tlast = false; + d.data = C_buffer[n * acc_args.M + m]; + if (n + 1 == acc_args.N && m + 1 == acc_args.M) + d.tlast = true; + dout1.write(d); + C_buffer[n * acc_args.M + m] = 0; + send_C_len++; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/mm_4x4_v4.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/mm_4x4_v4.json new file mode 100644 index 0000000..1be763f --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/mm_4x4_v4.json @@ -0,0 +1,361 @@ +{ + "name": "MM_4x4_v4", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 4096, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#tile_N": 4, + "#tile_M": 4, + "#tile_K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": 
true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_M", + "#tile_K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_extension": 64 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 
0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ] + }, + "op_arg": { + "0-15": "#N", + "16-31": "#M", + "31-63": "#K" + } + }, + "schedule": { + "allowed_patterns": [ + "R#a, R#b, C, S#c", + "a" + ] + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/api_v0.h b/include/soda/ExecutionEngine/axi/api_v0.h new file mode 100644 index 0000000..fcd15c5 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/api_v0.h @@ -0,0 +1,110 @@ +//**********************Deprecated********************** + +#ifndef AXI_APIv0 +#define AXI_APIv0 + +#include +#include +#include +#include +#include + +// API Model = One DMA is allocated with an input and an output buffer +// TODO: Struct based representation of API model + +struct dma { +#define MM2S_CONTROL_REGISTER 0x00 +#define MM2S_STATUS_REGISTER 0x04 +#define MM2S_START_ADDRESS 0x18 +#define MM2S_LENGTH 0x28 + +#define S2MM_CONTROL_REGISTER 0x30 +#define S2MM_STATUS_REGISTER 0x34 +#define S2MM_DESTINATION_ADDRESS 0x48 +#define S2MM_LENGTH 0x58 + + unsigned int id; + unsigned int *dma_address; + unsigned int *dma_input_addr; + unsigned int *dma_output_addr; + + unsigned int dma_input_len; + unsigned int dma_output_len; + + void init(int id); + + void dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + dma_virtual_address[offset >> 2] = value; + } + + unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + return dma_virtual_address[offset >> 2]; + } +}; + +struct dma_collection { + + // Variables + int dma_count; + struct dma *dma_list; + + //-----------------DMA Functions----------------- + /** + * dma_address is base address of dma + * dma_input_addr is starting memory location for the dma input buffer, + * dma_input_len is length of the buffer dma_output_addr is starting memory + * location for the dma output buffer, dma_output_len is length of the buffer + * Memory maps dma's base address + * Runs starting controls signals and sets MMS2, S2MM address registers to + * start memory locations of the input and output buffers + */ + void dma_init(int dma_count, unsigned int *dma_address, + unsigned int *dma_input_addr, unsigned int *dma_input_len, + unsigned int *dma_output_addr, unsigned int *dma_output_len); + + // Memory unmaps DMA base addresses and Input and output buffers + void dma_free(); + + // Get base address for dma represented by dma_id, + unsigned int *dma_get_regaddr(); + + //-----------------BUFFER Functions----------------- + // Get the MMap address of the input buffer of the dma + unsigned int *dma_get_inbuffer(); + + // Get the MMap address of the output buffer of the dma + unsigned int *dma_get_outbuffer(); + + //-----------------DMA MMS2 Functions----------------- + /** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_set_transfer(int dma_id, int length); + + // Blocks thread until dma MMS2 transfer is complete + void dma_send(int dma_id, int buffer_ID, int length); + + // Same as dma_send but thread does not block, returns if 0 + int dma_send_nb(int dma_id, int buffer_ID, int length); + + //-----------------DMA S2MM Functions----------------- + /** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, 
returns negative if error occurs + */ + int dma_set_store(int dma_id, int buffer_ID, int length); + + // Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) + void dma_recv(int dma_id, int buffer_ID, int length); + + // Same as dma_recv but thread does not block, returns if 0 + int dma_recv_nb(int dma_id, int buffer_ID, int length); +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/api_v1.h b/include/soda/ExecutionEngine/axi/api_v1.h new file mode 100644 index 0000000..03bfd61 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/api_v1.h @@ -0,0 +1,200 @@ +#ifndef AXI_APIv1 +#define AXI_APIv1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef SYSC +// Easy way to switch between systemC accelerators --- there is probably a +// better way + +#ifdef CONV_V1 +#include "soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h" +#elif ACC_V5 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h" +#elif ACC_V4 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h" +#elif ACC_V3 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h" +#elif ACC_V2 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h" +#else +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h" +#endif +#endif + +// API Model = One DMA is allocated with a single input and output buffer (Can +// have different size) + +// Simple view of DMA +/* +dma -> { + control_register_address : unsigned int # Mapped address to the start of +the DMA control registers Buffer input_buffer (address,size) Buffer +output_buffer (address,size) +} +*/ + +struct dma { +#define MM2S_CONTROL_REGISTER 0x00 +#define MM2S_STATUS_REGISTER 0x04 +#define MM2S_START_ADDRESS 0x18 +#define MM2S_LENGTH 0x28 +#define S2MM_CONTROL_REGISTER 0x30 +#define S2MM_STATUS_REGISTER 0x34 +#define S2MM_DESTINATION_ADDRESS 0x48 +#define S2MM_LENGTH 0x58 +#define PAGE_SIZE getpagesize() + +#define m_assert(expr, msg) assert(((void)(msg), (expr))) + +// Define this variable for additional profiling info (in api_v1_sysc.cpp) +#define PROFILE +#ifdef PROFILE +#define PLOG(x) std::cout << x << std::endl +#define PFUNC(x) x +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define PLOG(x) do { } while(0) +// #define PFUNC(x) do { } while(0) +#define PLOG(x) +#define PFUNC(x) +#endif + +// Define this variable for additional debug info +// #define VERBOSE_AXI +#ifdef VERBOSE_AXI +#define D(x) \ + do { \ + x \ + } while (0) +#define LOG(x) std::cout << x << std::endl +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define D(x) do { } while(0) +// #define LOG(x) do { } while(0) +#define D(x) +#define LOG(x) +#endif + + unsigned int *dma_address; + unsigned int *dma_input_address; + unsigned int *dma_output_address; + unsigned int dma_input_buffer_size; + unsigned int dma_output_buffer_size; + unsigned int dma_input_paddress; + unsigned int dma_output_paddress; + unsigned int *acc_address; + unsigned int current_input_offset; + + // Profiling Variables + unsigned int dma_send_length = 0; + unsigned int dma_recv_length = 0; + unsigned int dma_send_count = 0; + unsigned int dma_recv_count = 0; + + // temp --- need to remove later + bool verbose = false; + +#ifdef SYSC + ACCNAME *acc; + DMA_DRIVER *dmad; +#endif + + void dma_init(unsigned int dma_address, unsigned 
int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size); + + // Memory unmaps DMA control_register_address and Input and output buffers + void dma_free(); + + // We could reduce to one set of the following calls + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Get the MMap address of the input buffer of the dma *Needed to copy data + // to Input_Buffer* + unsigned int *dma_get_inbuffer(); + + // Get the MMap address of the output buffer of the dma *Needed to copy data + // from Output_Buffer* + unsigned int *dma_get_outbuffer(); + + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Copy data into the Input Buffer (length to write, offset to write to) + // returns 0 if successful + int dma_copy_to_inbuffer(unsigned int *host_src_address, int data_length, + int offset); + + template + int mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + // Copy data from the Output Buffer (length to read, offset to read from) + // returns 0 if successful + int dma_copy_from_outbuffer(unsigned int *host_dst_address, int data_length, + int offset); + + template + int mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + //============================================================================ + + //-----------------DMA MMS2 Functions----------------- + /** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_send(int length, int offset); + + // Blocks thread until dma MMS2 transfer is complete + void dma_wait_send(); + + // Same as dma_send but thread does not block, returns 0 if done + int dma_check_send(); + + //-----------------DMA S2MM Functions----------------- + /** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_recv(int length, int offset); + + // Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) + void dma_wait_recv(); + + // Same as dma_recv but thread does not block, returns 0 if done + int dma_check_recv(); + + //********************************** Unexposed Functions + //********************************** + void initDMAControls(); + void dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value); + unsigned int dma_get(unsigned int *dma_virtual_address, int offset); + void dma_mm2s_sync(); + void dma_s2mm_sync(); + void acc_init(unsigned int base_addr, int length); + void dump_acc_signals(int state); +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/api_v2.h b/include/soda/ExecutionEngine/axi/api_v2.h new file mode 100644 index 0000000..bdb25f8 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/api_v2.h @@ -0,0 +1,217 @@ +#ifndef AXI_APIv2 +#define AXI_APIv2 + +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include + +#ifdef SYSC +// Easy way to switch between systemC accelerators --- there is probably a +// better way + +#ifdef CONV_V1 +#include "soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h" +#elif ACC_V5 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h" +#elif ACC_V4 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h" +#elif ACC_V3 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h" +#elif ACC_V2 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h" +#else +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h" +#endif +#endif + +// API Model = One DMA is allocated with a single input and output buffer (Can +// have different size) + +// clang-format off +// Simple view of DMA +/* +dma -> { + control_register_address : unsigned int # Mapped address to the start of + the DMA control registers + Buffer input_buffer (address,length,size_of_element) + Buffer output_buffer (address,length,size_of_element) + All lengths are in elements + All offsets are in elements + The library will handle the conversion to bytes + Pointer access to the buffer returns char* + Once, within dma_init, MLIR/external user needs to specify size of the I/O elements +} +*/ +// clang-format on + +struct dma { +#define MM2S_CONTROL_REGISTER 0x00 +#define MM2S_STATUS_REGISTER 0x04 +#define MM2S_START_ADDRESS 0x18 +#define MM2S_LENGTH 0x28 +#define S2MM_CONTROL_REGISTER 0x30 +#define S2MM_STATUS_REGISTER 0x34 +#define S2MM_DESTINATION_ADDRESS 0x48 +#define S2MM_LENGTH 0x58 +#define PAGE_SIZE getpagesize() + +#define m_assert(expr, msg) assert(((void)(msg), (expr))) + +// Define this variable for additional profiling info (in api_v1_sysc.cpp) +#define PROFILE +#ifdef PROFILE +#define PLOG(x) std::cout << x << std::endl +#define PFUNC(x) x +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define PLOG(x) do { } while(0) +// #define PFUNC(x) do { } while(0) +#define PLOG(x) +#define PFUNC(x) +#endif + +// Define this variable for additional debug info +// #define VERBOSE_AXI +#ifdef VERBOSE_AXI +#define D(x) \ + do { \ + x \ + } while (0) +#define LOG(x) std::cout << x << std::endl +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define D(x) do { } while(0) +// #define LOG(x) do { } while(0) +#define D(x) +#define LOG(x) +#endif + // I/O addresses are in type char or handled with type char size + // I/O lengths are in elements + unsigned int *dma_address; + char *dma_input_address; + char *dma_output_address; + unsigned int dma_input_buffer_size; + unsigned int dma_output_buffer_size; + unsigned int isize; + unsigned int osize; + + // These addresses are in physical memory + unsigned int dma_input_paddress; + unsigned int dma_output_paddress; + + // Maybe remove + unsigned int *acc_address; + // unsigned int current_input_offset; + + // Profiling Variables + unsigned int dma_send_length = 0; + unsigned int dma_recv_length = 0; + unsigned int dma_send_count = 0; + unsigned int dma_recv_count = 0; + + // temp --- need to remove later + bool verbose = false; + +#ifdef SYSC + ACCNAME *acc; + DMA_DRIVER *dmad; +#endif + + void dma_init(unsigned int dma_address, unsigned int dma_input_address, + unsigned int dma_input_buffer_size, unsigned int isize, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size, unsigned int osize); + + // 
Memory unmaps DMA control_register_address and Input and output buffers + void dma_free(); + + // We could reduce to one set of the following calls + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Get the MMap address of the input buffer of the dma *Needed to copy data + // to Input_Buffer* + char *dma_get_inbuffer(); + + // Get the MMap address of the output buffer of the dma *Needed to copy data + // from Output_Buffer* + + char *dma_get_outbuffer(); + + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Copy data into the Input Buffer (length to write, offset to write to) + // returns 0 if successful + // int dma_copy_to_inbuffer(unsigned int *host_src_address, int data_length, + // int offset); + + // Copy data from the Output Buffer (length to read, offset to read from) + // returns 0 if successful + // int dma_copy_from_outbuffer(unsigned int *host_dst_address, int + // data_length, + // int offset); + + template + int mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + template + int mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + //============================================================================ + + //-----------------DMA MMS2 Functions----------------- + /** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_send(unsigned int length, unsigned int offset); + + // Blocks thread until dma MMS2 transfer is complete + void dma_wait_send(); + + // Same as dma_send but thread does not block, returns 0 if done + int dma_check_send(); + + //-----------------DMA S2MM Functions----------------- + /** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_recv(unsigned int length, unsigned int offset); + + // Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) + void dma_wait_recv(); + + // Same as dma_recv but thread does not block, returns 0 if done + int dma_check_recv(); + + //********************************** Unexposed Functions + //********************************** + void initDMAControls(); + void dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value); + unsigned int dma_get(unsigned int *dma_virtual_address, int offset); + void dma_mm2s_sync(); + void dma_s2mm_sync(); + void acc_init(unsigned int base_addr, int length); + void dump_acc_signals(int state); +}; + +#endif \ No newline at end of file diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 897fbe7..d85bbf5 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(Misc) add_subdirectory(Dialect) add_subdirectory(Conversion) -add_subdirectory(CAPI) \ No newline at end of file +add_subdirectory(CAPI) +add_subdirectory(ExecutionEngine) \ No newline at end of file diff --git 
a/lib/Conversion/AccelToRuntime/AccelToAXI4MLIR.cpp b/lib/Conversion/AccelToRuntime/AccelToAXI4MLIR.cpp new file mode 100644 index 0000000..1f1af95 --- /dev/null +++ b/lib/Conversion/AccelToRuntime/AccelToAXI4MLIR.cpp @@ -0,0 +1,440 @@ +//===- AccelToAXI4MLIR.cpp - Convert Accel to AXI4MLIR calls --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements lowering of Accel to AXI4MLIR calls +// +//===----------------------------------------------------------------------===// + +#include "soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h" + +#include "../PassDetail.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "soda/Dialect/Accel/IR/Accel.h" + +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" + +//===----------------------------------------------------------------------===// +// AXI4MLIR Runtime C API declaration. +//===----------------------------------------------------------------------===// +static constexpr const char *kDmaInit = "dma_init"; +static constexpr const char *kDmaFree = "dma_free"; +static constexpr const char *kCopyToInbufferF32 = "copy_to_inbuffer_f32"; +static constexpr const char *kCopyFromOutbufferF32 = "copy_from_outbuffer_f32"; +static constexpr const char *kCopyToInbufferI32 = "copy_to_inbuffer_i32"; +static constexpr const char *kCopyFromOutbufferI32 = "copy_from_outbuffer_i32"; +static constexpr const char *kDmaStartSend = "dma_start_send"; +static constexpr const char *kDmaWaitSend = "dma_wait_send"; +static constexpr const char *kDmaStartRecv = "dma_start_recv"; +static constexpr const char *kDmaWaitRecv = "dma_wait_recv"; + +using namespace mlir; +using namespace mlir::func; + +class InitDMAToAXI4MLIRCall : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::InitDMAOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + + auto name = kDmaInit; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + // Forward declare function if it hasn't already been + if (!opFunc) { // TODO: Check dma_free + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&module->getRegion(0).front()); + + MLIRContext *ctx = rewriter.getContext(); + Location uLoc = rewriter.getUnknownLoc(); + Type intTy = rewriter.getI32Type(); + FunctionType fType; + + fType = FunctionType::get(ctx, {intTy, intTy, intTy, intTy, intTy}, {}); + rewriter.create(uLoc, name, fType).setPrivate(); + + fType = FunctionType::get(ctx, {}, {}); + rewriter.create(uLoc, kDmaFree, fType).setPrivate(); + } + assert(isa(SymbolTable::lookupSymbolIn(module, name))); + + rewriter.replaceOpWithNewOp(op, name, /*TODO no type?*/ TypeRange(), + op->getOperands()); + // TODO: this may create several DMA frees, but only one is needed + rewriter.setInsertionPoint(op->getBlock()->getTerminator()); + rewriter.create(rewriter.getUnknownLoc(), kDmaFree, + /*TODO no type?*/ TypeRange(), ValueRange()); + + return success(); + } +}; + 
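+// For illustration, a minimal sketch of what InitDMAToAXI4MLIRCall produces
+// (SSA names and the accel op spelling are illustrative, not taken verbatim
+// from the dialect definition):
+//
+//   accel.init_dma %dmaAddr, %inAddr, %inSize, %outAddr, %outSize
+//
+// is rewritten into
+//
+//   call @dma_init(%dmaAddr, %inAddr, %inSize, %outAddr, %outSize)
+//       : (i32, i32, i32, i32, i32) -> ()
+//   ...
+//   call @dma_free() : () -> ()  // inserted right before the block terminator
+//
+// together with private declarations of @dma_init and @dma_free in the module.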
+// Forward declare functions for SendOp +static void fwdDeclareSendFuncs(PatternRewriter &rewriter, Operation *module, + Type intTy, Type mrTy) { + + // TODO: Name has to match memref type + // TODO: This is giving the i32 name but memref may be f32 + auto name = kCopyToInbufferI32; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + if (!opFunc) { // TODO: check for the other function names + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&module->getRegion(0).front()); + + MLIRContext *ctx = rewriter.getContext(); + Location uLoc = rewriter.getUnknownLoc(); + FunctionType fType; + + fType = FunctionType::get(ctx, {mrTy, intTy}, {intTy}); + rewriter.create(uLoc, name, fType).setPrivate(); + + fType = FunctionType::get(ctx, {intTy, intTy}, {intTy}); + rewriter.create(uLoc, kDmaStartSend, fType).setPrivate(); + + fType = FunctionType::get(ctx, {}, {}); + rewriter.create(uLoc, kDmaWaitSend, fType).setPrivate(); + } + assert(isa(SymbolTable::lookupSymbolIn(module, name))); +} + +// Forward declare functions for RecvOp +static void fwdDeclareRecvFuncs(PatternRewriter &rewriter, Operation *module, + Type intTy, Type mrTy) { + auto name = kCopyFromOutbufferI32; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + if (!opFunc) { // TODO: check for the other function names + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&module->getRegion(0).front()); + + MLIRContext *ctx = rewriter.getContext(); + Location uLoc = rewriter.getUnknownLoc(); + FunctionType fType; + + fType = FunctionType::get(ctx, {mrTy, intTy}, {intTy}); + rewriter.create(uLoc, name, fType).setPrivate(); + + fType = FunctionType::get(ctx, {intTy, intTy}, {intTy}); + rewriter.create(uLoc, kDmaStartRecv, fType).setPrivate(); + + fType = FunctionType::get(ctx, {}, {}); + rewriter.create(uLoc, kDmaWaitRecv, fType).setPrivate(); + } + assert(isa(SymbolTable::lookupSymbolIn(module, name))); +} + +// Create ops to get number of elements in dynamic sized SubViewOp +static Value getNumElements(PatternRewriter &rewriter, Location loc, + memref::SubViewOp subViewOp, MemRefType inputType, + Type intTy) { + Value nElements; + + SmallVector sizes; + for (unsigned idx = 0; idx < inputType.getRank(); ++idx) { + sizes.push_back(subViewOp.getDynamicSizes()[idx]); + } + + // Create as many arith::MulIOps as needed to calculate # of elements + nElements = sizes[0]; + for (unsigned i = 1; i < inputType.getRank(); ++i) { + nElements = rewriter.create(loc, nElements, sizes[i]); + } + nElements = rewriter.create(loc, intTy, nElements); + return nElements; +} + +class SendToAXI4MLIRCall : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::SendOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + Location loc = op->getLoc(); + + auto name = kCopyToInbufferI32; + + Type intTy = rewriter.getI32Type(); + Value input = op.getInput(); + auto inputType = input.getType().dyn_cast_or_null(); + if (!inputType) + return failure(); + auto myType = inputType.getElementType(); + Type mrTy = UnrankedMemRefType::get(myType, 0); + + fwdDeclareSendFuncs(rewriter, module, intTy, mrTy); + + // TODO: Not sure if getOffestValue is working + auto initOffset = op.getOffsetValue(); + if (!initOffset) { + initOffset = + rewriter.create(loc, IntegerAttr::get(intTy, 0)); + } + + // Send flow: copy, start, wait + Value casted = 
rewriter.create(loc, mrTy, input); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + int bitWidth = inputType.getElementTypeBitWidth(); + + // create a lambda function that uses isDynamicSize(idx) and returns true if + // one of the sizes is dynamic + if (inputType.hasStaticShape()) { + // llvm::errs() << "SendToAXI4MLIRCall: inputType has static shape\n"; + int numElements = inputType.getNumElements(); + // int bytes = numElements * bitWidth / 8; + + Value nElements = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.create(loc, kDmaStartSend, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitSend, TypeRange()); + + Value resultOffset = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.replaceOp(op, {resultOffset}); + } else { + // llvm::errs() << "SendToAXI4MLIRCall: inputType has dynamic shape\n"; + + // First get the number of elements from dynamic sizes + memref::SubViewOp subViewOp = + dyn_cast(input.getDefiningOp()); + if (!subViewOp) { + // llvm::errs() << "SendToAXI4MLIRCall: input is not a subview\n"; + return failure(); + } + Value nElements = + getNumElements(rewriter, loc, subViewOp, inputType, intTy); + + rewriter.create(loc, kDmaStartSend, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitSend, TypeRange()); + + // If many actions are chained, they are placed in order in the DMA, + // thus the offset is the size of the previous action. + Value resultOffset = nElements; + // Value bitWidthV = rewriter.create( + // loc, IntegerAttr::get(intTy, bitWidth)); + // resultOffset = + // rewriter.create(loc, resultOffset, bitWidthV); + // Value eight = + // rewriter.create(loc, IntegerAttr::get(intTy, + // 8)); + // resultOffset = rewriter.create(loc, resultOffset, + // eight); + rewriter.replaceOp(op, {resultOffset}); + } + + return success(); + } +}; + +// Rewrite SendLiteral to a call of kCopyToInbufferI32. +// This could be optimized to transfer the literal directly to the +// DMA buffer instead of going through a temporary memref. 
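+// As a rough sketch (SSA names are illustrative), the lowering of one literal
+// send looks like:
+//   %tmp  = memref.alloc() : memref<i32>       // scratch memref holding the literal
+//   memref.store %opcode, %tmp[] : memref<i32>
+//   %cast = memref.cast %tmp : memref<i32> to memref<*xi32>
+//   %0 = call @copy_to_inbuffer_i32(%cast, %offset) : (memref<*xi32>, i32) -> i32
+//   %1 = call @dma_start_send(%c1, %offset) : (i32, i32) -> i32  // a single element
+//   call @dma_wait_send() : () -> ()
+//   memref.dealloc %tmp : memref<i32>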
+class SendLiteralToAXI4MLIRCall + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::SendLiteralOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + Location loc = op->getLoc(); + + // TODO: Name has to match memref type + auto name = kCopyToInbufferI32; + Type intTy = rewriter.getI32Type(); + Value opcode = op.getOpcode(); + + // Create a memref and store the opcode in it + auto tmpMrTy = MemRefType::get(/*shape*/ {}, rewriter.getIntegerType(32)); + auto input = rewriter.create(loc, tmpMrTy); + rewriter.create(loc, opcode, input, ValueRange()); + + auto inputType = input.getType().dyn_cast_or_null(); + if (!inputType) + return failure(); + auto myType = inputType.getElementType(); + Type mrTy = UnrankedMemRefType::get(myType, 0); + + fwdDeclareSendFuncs(rewriter, module, intTy, mrTy); + + auto initOffset = op.getOffsetValue(); + if (!initOffset) { + initOffset = + rewriter.create(loc, IntegerAttr::get(intTy, 0)); + } + + // Send flow: copy, start, wait + Value casted = rewriter.create(loc, mrTy, input); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + int numElements = inputType.getNumElements(); + int bitWidth = inputType.getElementTypeBitWidth(); + // int bytes = numElements * bitWidth / 8; + + Value nElements = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.create(loc, kDmaStartSend, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitSend, TypeRange()); + + // Free the temporary memref + rewriter.create(loc, input); + + Value resultOffset = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.replaceOp(op, {resultOffset}); + + return success(); + } +}; + +class RecvToAXI4MLIRCall : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::RecvOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + Location loc = op->getLoc(); + + // TODO: Name has to match memref type + auto name = kCopyFromOutbufferI32; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + + Type intTy = rewriter.getI32Type(); + Value dst = op.getDst(); + auto inputType = dst.getType().dyn_cast_or_null(); + if (!inputType) + return failure(); + auto myType = inputType.getElementType(); + Type mrTy = UnrankedMemRefType::get(myType, 0); + + fwdDeclareRecvFuncs(rewriter, module, intTy, mrTy); + + auto initOffset = op.getOffsetValue(); + if (!initOffset) { + initOffset = + rewriter.create(loc, IntegerAttr::get(intTy, 0)); + } + + Value casted = rewriter.create(loc, mrTy, dst); + int bitWidth = inputType.getElementTypeBitWidth(); + if (inputType.hasStaticShape()) { + // llvm::errs() << "RecvToAXI4MLIRCall: inputType has static shape\n"; + int numElements = inputType.getNumElements(); + // int bytes = numElements * bitWidth / 8; + + Value nElements = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + + // Recv flow: start, wait, copy + rewriter.create(loc, kDmaStartRecv, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitRecv, TypeRange()); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + Value resultOffset = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.replaceOp(op, {resultOffset}); + + } else { + // llvm::errs() << "RecvToAXI4MLIRCall: inputType 
has dynamic shape\n"; + + // First get the number of elements from dynamic sizes + memref::SubViewOp subViewOp = + dyn_cast(dst.getDefiningOp()); + if (!subViewOp) { + llvm::errs() << "RecvToAXI4MLIRCall: input is not a subview\n"; + return failure(); + } + Value nElements = + getNumElements(rewriter, loc, subViewOp, inputType, intTy); + + rewriter.create(loc, kDmaStartRecv, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitRecv, TypeRange()); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + // If many actions are chained, they are placed in order in the DMA, + // thus the offset is the size of the previous action. + Value resultOffset = nElements; + // Value bitWidthV = rewriter.create( + // loc, IntegerAttr::get(intTy, bitWidth)); + // resultOffset = + // rewriter.create(loc, resultOffset, bitWidthV); + // Value eight = + // rewriter.create(loc, IntegerAttr::get(intTy, + // 8)); + // resultOffset = rewriter.create(loc, resultOffset, + // eight); + rewriter.replaceOp(op, {resultOffset}); + } + + return success(); + } +}; + +void mlir::populateAccelToAXI4MLIRConversionPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); +} + +namespace { +struct ConvertAccelToAXI4MLIRPass + : public ConvertAccelToAXI4MLIRBase { + void runOnOperation() override; +}; +} // namespace + +void ConvertAccelToAXI4MLIRPass::runOnOperation() { + auto module = getOperation(); + + RewritePatternSet patterns(&getContext()); + populateAccelToAXI4MLIRConversionPatterns(patterns); + + ConversionTarget target(getContext()); + // clang-format off + target.addLegalDialect(); + // clang-format on + target.addIllegalDialect(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +std::unique_ptr> +mlir::createConvertAccelToAXI4MLIRPass() { + return std::make_unique(); +} diff --git a/lib/Conversion/AccelToRuntime/CMakeLists.txt b/lib/Conversion/AccelToRuntime/CMakeLists.txt new file mode 100644 index 0000000..6270ff6 --- /dev/null +++ b/lib/Conversion/AccelToRuntime/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_dialect_library(SODAAccelToRuntime + AccelToAXI4MLIR.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJ_INCLUDE_DIR}/soda/Conversion/AccelToRuntime + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass +) + \ No newline at end of file diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index b90a4e0..f445062 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -1,2 +1,4 @@ add_subdirectory(KernelsToSODA) -add_subdirectory(CustomFuncToLLVM) \ No newline at end of file +add_subdirectory(CustomFuncToLLVM) +add_subdirectory(AccelToRuntime) +add_subdirectory(LinalgToAccel) diff --git a/lib/Conversion/LinalgToAccel/AXI4MLIRUtils.cpp b/lib/Conversion/LinalgToAccel/AXI4MLIRUtils.cpp new file mode 100644 index 0000000..3c33397 --- /dev/null +++ b/lib/Conversion/LinalgToAccel/AXI4MLIRUtils.cpp @@ -0,0 +1,349 @@ +//===- AXI4MLIRUtils.cpp - Shared functions during conversions --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h" + +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +using namespace mlir; +using namespace mlir::linalg; +using namespace mlir::func; + +const StringLiteral kLinalgTransformMarker = "__internal_linalg_transform__"; + +struct LinalgOpChangeFilterPattern + : public OpInterfaceRewritePattern { + LinalgOpChangeFilterPattern( + MLIRContext *context, + LinalgTransformationFilter f = LinalgTransformationFilter(), + PatternBenefit benefit = 1) + : OpInterfaceRewritePattern(context, benefit), + filter(std::move(f)) {} + + LinalgOpChangeFilterPattern( + StringRef opName, MLIRContext *context, + LinalgTransformationFilter f = LinalgTransformationFilter(), + PatternBenefit benefit = 1) + : OpInterfaceRewritePattern(context, benefit), + filter(f.addOpNameFilter(opName)) {} + + LogicalResult matchAndRewrite(LinalgOp op, + PatternRewriter &rewriter) const override { + if (failed(filter.checkAndNotify(rewriter, op))) + return failure(); + rewriter.startRootUpdate(op); + filter.replaceLinalgTransformationFilter(rewriter, op); + rewriter.finalizeRootUpdate(op); + return success(); + } + +private: + /// LinalgTransformMarker handles special attribute manipulations. + LinalgTransformationFilter filter; +}; + +static void addTilingPatternToSet(RewritePatternSet &patterns, MLIRContext *ctx, + const StringRef &srcAttrName, + const StringRef &dstAttrName, + const SmallVector &tileSizes) { + + // create SmallVector of int64_t from tileSizes + SmallVector tileSizesInt64; + for (auto ts : tileSizes) { + tileSizesInt64.push_back(ts); + } + // create a ArrayRef from tileSizes + ArrayRef tileSizesRef(tileSizesInt64); + + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes(tileSizesRef), + LinalgTransformationFilter(StringAttr::get(ctx, srcAttrName), + StringAttr::get(ctx, dstAttrName))); +} + +static void addTilingPatternToSet(RewritePatternSet &patterns, MLIRContext *ctx, + const StringRef &srcAttrName, + const StringRef &dstAttrName, + const unsigned &tsd0, const unsigned &tsd1, + const unsigned &tsd2) { + addTilingPatternToSet(patterns, ctx, srcAttrName, dstAttrName, + SmallVector{tsd0, tsd1, tsd2}); +} + +void mlir::populateCommonLinalgTransformationPatterns( + RewritePatternSet &patterns, const AccelTransformationOptions &options) { + MLIRContext *ctx = patterns.getContext(); + + // Triggers on operations with kLinagTransformMarker set to "GENERALIZE" + patterns.add( + ctx, LinalgTransformationFilter(StringAttr::get(ctx, "GENERALIZE"), + StringAttr::get(ctx, "ANNOTATE"))); + + // ANNOTATE to INTERCHANGE is performed by custom pattern + + // Perform loop interchange with GenericOpInterchangePattern + // This only correctly interchanges loops for GenericOps, thus + // generalization must be done prior to this step. 
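+  // Note on the cache-level tiling further below: options.tileSizes is
+  // consumed in groups of three, one loop tile-size triple per cache level.
+  // For example (illustrative values), with numberOfCaches = 2 and
+  // tileSizes = {64, 64, 64, 8, 8, 8} the first triple tiles MEM -> L2 and
+  // the second tiles L2 -> L1; with numberOfCaches = 3 the first triple is
+  // used for MEM -> L3 instead, followed by L3 -> L2 and L2 -> L1.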
+ if (options.loopPermutation.size() > 0) { + patterns.add( + ctx, options.loopPermutation, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } else { + // Simply add a pattern to change the attribute + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } + + // z7020 ARM A9 core specs + // L1: 32KB 4-way set-associative (instruction and data caches independent + // for each CPU) + // L2: 512KB 8-way set-associative (shared between CPUs) + + // Pynq-z2 + // z7020 chip + // 512MB DDR3 with 16-bit bus @ 1050Mbps + + // Pynq-z2 + // z7020 chip + // 512 Mbyte DDR3 + + // M N K ELEMSize Total bytes Total KB + // 1,024 1,024 1,024 4 12,582,912 12,288.00 + // 512 512 512 4 3,145,728 3,072.00 + // 256 256 256 4 786,432 768.00 + // 128 128 128 4 196,608 192.00 + // 64 64 64 4 49,152 48.00 + // 32 32 32 4 12,288 12.00 + // 16 16 16 4 3,072 3.00 + // 8 8 8 4 768 0.75 + // 4 4 4 4 192 0.19 + // 2 2 2 4 48 0.05 + + if (options.tileSizes.size() > 0) { + unsigned tileIdx = 0; + + if (options.numberOfCaches == 3) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L3", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L3", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 2) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 1) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + } else { + // No tile sizes provided: simply add a pattern to change the attribute + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTransformationFilter(StringAttr::get(ctx, "MEM"), + StringAttr::get(ctx, "L1"))); + } + + // At this point relevant operations will have the L1 marker + // Only accelerator tiling is missing + if (options.accelSizes.size() > 0) { + // TODO: Pass in the accel sizes as an ArrayRef + assert(options.accelSizes.size() == 3 && "please provide 3 tile sizes"); + + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes({options.accelSizes[0], + options.accelSizes[1], + options.accelSizes[2]}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + } else { + if (options.accelSize > 1) { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes( + {options.accelSize, options.accelSize, options.accelSize}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + + } else { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes({4, 4, 4}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + } 
+ } +} + +/// Apply tiling patterns to GenericOps with the correct attribute +void mlir::applyPatterns(FuncOp funcOp, + const AccelTransformationOptions &options) { + MLIRContext *ctx = funcOp.getContext(); + RewritePatternSet patterns(ctx); + + // Triggers on operations with kLinagTransformMarker set to "GENERALIZE" + patterns.add( + ctx, LinalgTransformationFilter(StringAttr::get(ctx, "GENERALIZE"), + StringAttr::get(ctx, "INTERCHANGE"))); + + // Perform loop interchange with GenericOpInterchangePattern + // This only correctly interchanges loops for GenericOps, thus + // generalization must be done prior to this step. + if (options.loopPermutation.size() > 0) { + patterns.add( + ctx, options.loopPermutation, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } else { + // add pattern to change attribute + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } + + // z7020 ARM A9 core specs + // L1: 32KB 4-way set-associative (instruction and data caches independent + // for each CPU) + // L2: 512KB 8-way set-associative (shared between CPUs) + + // Pynq-z2 + // z7020 chip + // 512MB DDR3 with 16-bit bus @ 1050Mbps + + // Pynq-z2 + // z7020 chip + // 512 Mbyte DDR3 + + // M N K ELEMSize Total bytes Total KB + // 1,024 1,024 1,024 4 12,582,912 12,288.00 + // 512 512 512 4 3,145,728 3,072.00 + // 256 256 256 4 786,432 768.00 + // 128 128 128 4 196,608 192.00 + // 64 64 64 4 49,152 48.00 + // 32 32 32 4 12,288 12.00 + // 16 16 16 4 3,072 3.00 + // 8 8 8 4 768 0.75 + // 4 4 4 4 192 0.19 + // 2 2 2 4 48 0.05 + + if (options.tileSizes.size() > 0) { + unsigned tileIdx = 0; + + if (options.numberOfCaches == 3) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L3", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L3", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 2) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 1) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + } else { + // If no tile sizes were selected + addTilingPatternToSet(patterns, ctx, "MEM", "L1", 4096, 4096, 4096); + } + + // At this point relevant operations will have the L1 marker + // Only accelerator tiling is missing + if (options.accelSize > 1) { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes( + {options.accelSize, options.accelSize, options.accelSize}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + + } else { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes({4, 4, 4}), + 
LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + } + + (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); +} + +void AccelTransformationOptions::dump() const { + llvm::errs() << "accelSize: " << accelSize << "\n" + << "dmaAddress\t\t " << dmaAddress << "\n" + << "dmaInputAddress\t\t " << dmaInputAddress << "\n" + << "dmaInputBufferSize\t " << dmaInputBufferSize << "\n" + << "dmaOutputAddress\t " << dmaOutputAddress << "\n" + << "dmaOutputBufferSize\t " << dmaOutputBufferSize << "\n" + << "flowCpuAcc\t\t " << flowCpuAcc << "\n" + << "numberOfCaches\t\t " << numberOfCaches + << "\n" + // << "cacheSizes\t\t " << cacheSizes << "\n" + // << "tileSizes\t\t " << tileSizes << "\n" + << "elementSize\t\t " << elementSize + << "\n" + // << "loopPermutation\t\t " << loopPermutation << "\n" + << "anchorFuncName\t\t " << anchorFuncName << "\n" + << "anchorOpName\t\t " << anchorOpName << "\n" + << "opcodeMap\t\t " << opcodeMap << "\n" + << "initFlow\t\t " << initFlow << "\n" + << "opcodeFlow\t\t " << opcodeFlow << "\n"; +} \ No newline at end of file diff --git a/lib/Conversion/LinalgToAccel/CMakeLists.txt b/lib/Conversion/LinalgToAccel/CMakeLists.txt new file mode 100644 index 0000000..a92c5e2 --- /dev/null +++ b/lib/Conversion/LinalgToAccel/CMakeLists.txt @@ -0,0 +1,19 @@ +add_mlir_conversion_library(SODALinalgToAccel + LinalgGenericToAccel.cpp + AXI4MLIRUtils.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJ_INCLUDE_DIR}/soda/Conversion/LinalgToAccel + + DEPENDS + SODAConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRArithDialect + SODAAccelDialect + MLIRMemRefDialect + MLIRTransforms + ) diff --git a/lib/Conversion/LinalgToAccel/LinalgGenericToAccel.cpp b/lib/Conversion/LinalgToAccel/LinalgGenericToAccel.cpp new file mode 100644 index 0000000..e83a2b1 --- /dev/null +++ b/lib/Conversion/LinalgToAccel/LinalgGenericToAccel.cpp @@ -0,0 +1,1034 @@ +//===- LinalgGenericToAccel.cpp - Generic to accel conversions --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements from linalg generic to accel calls +// +//===----------------------------------------------------------------------===// + +#include "soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" + +#include "../PassDetail.h" + +#include "soda/Dialect/Accel/IR/Accel.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +#include "mlir/IR/OpcodeExpr.h" + +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +const StringLiteral kLinalgTransformMarker = "__internal_linalg_transform__"; +const StringLiteral kAccelTransformMarker = "__accel_transform__"; +const StringLiteral kAccel_dmaAddress = "accel_dmaAddress"; +const StringLiteral kAccel_dmaInputAddress = "accel_dmaInputAddress"; +const StringLiteral kAccel_dmaInputBufferSize = "accel_dmaInputBufferSize"; +const StringLiteral kAccel_dmaOuputAddress = "accel_dmaOutputAddress"; +const StringLiteral kAccel_dmaOuputBufferSize = "accel_dmaOutputBufferSize"; +const StringLiteral kAccel_acc_on_cpu = "accel_acc_on_cpu"; +const StringLiteral kAccel_accumulate_on_cpu = "accel_accumulate_on_cpu"; +const StringLiteral kAccel_opcode_map = "accel_opcode_map"; +const StringLiteral kAccel_opcode_map_str = "accel_opcode_map_str"; +const StringLiteral kAccel_opcode_flow = "accel_opcode_flow"; +const StringLiteral kAccel_opcode_flow_str = "accel_opcode_flow_str"; +const StringLiteral kAccel_loop_permutation = "accel_loop_permutation"; +const StringLiteral kAccel_accel_tile_size = "accel_accel_tile_size"; +const StringLiteral kAccel_accel_tile_sizes = "accel_accel_tile_sizes"; +const StringLiteral kAccel_tile_sizes = "accel_tile_sizes"; +const StringLiteral kAccel_init_flow = "accel_init_flow"; +const StringLiteral kAccel_init_flow_str = "accel_init_flow_str"; + +IntegerAttr getU32IntegerAttr(PatternRewriter &rewriter, unsigned value) { + return rewriter.getIntegerAttr(rewriter.getIntegerType(32, false), value); +} + +/// Remove quotes from string to prevent parser from treating it as string. +static StringRef prepStringOption(std::string &s, const char delim = '\"') { + // NOTE: There is an inconsistent bug with + // StringRef::drop_front(),drop_back(),consume_front(),consume_back() + // It likely does not update the size every time. + // NOTE: Input &s must be live after this function call. Passing by copy + // also does not work. + // return StringRef(s).consume_front(delim).consume_back(delim); + + if (s[s.length() - 1] == delim) + s.erase(s.end() - ((s.length() > 0) ? 1 : 0), s.end()); + if (s[0] == delim) + s.erase(s.begin()); + + return StringRef(s); +} + +/// Sets operation Attrs used in generic to accel conversion +class GenericAttrAnnotationPattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + /// Construct a generic pattern applied to all GenericOp that verify `filter`. + /// If attributes are already annotated, skip the replacement. 
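+  /// As an illustration (values are placeholders; real ones come from
+  /// `options` or from attributes already present on the op), a matched
+  /// linalg.generic ends up annotated roughly as:
+  ///   accel_dmaAddress = 0 : i32, accel_dmaInputBufferSize = 100000 : i32,
+  ///   accel_opcode_map_str = "...", accel_opcode_flow_str = "...",
+  ///   accel_init_flow_str = "...", accel_tile_sizes = [...],
+  ///   accel_accel_tile_size = 4 : i32, accel_acc_on_cpu = false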
+ GenericAttrAnnotationPattern( + MLIRContext *context, + linalg::LinalgTransformationFilter f = + linalg::LinalgTransformationFilter(), + AccelTransformationOptions options = AccelTransformationOptions(), + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), filter(f), + options(std::move(options)) {} + + LogicalResult matchAndRewrite(linalg::GenericOp op, + PatternRewriter &rewriter) const override { + return returningMatchAndRewrite(op, rewriter); + } + + /// Check if the attribute attrName is already set, if not, use a lambda + /// function to set it. + template + static void setAttrIfEmpty(Operation *op, StringRef attrName, Func lambda) { + if (!op->getAttr(attrName)) { + lambda(); + } + } + + LogicalResult returningMatchAndRewrite(linalg::GenericOp op, + PatternRewriter &rewriter) const { + if (failed(filter.checkAndNotify(rewriter, op))) + return failure(); + rewriter.startRootUpdate(op); + + // DMA Attributes + setAttrIfEmpty(op, kAccel_dmaAddress, [&]() { + op->setAttr(kAccel_dmaAddress, + rewriter.getI32IntegerAttr(options.dmaAddress)); + }); + setAttrIfEmpty(op, kAccel_dmaInputAddress, [&]() { + op->setAttr(kAccel_dmaInputAddress, + rewriter.getI32IntegerAttr(options.dmaInputAddress)); + }); + setAttrIfEmpty(op, kAccel_dmaInputBufferSize, [&]() { + op->setAttr(kAccel_dmaInputBufferSize, + rewriter.getI32IntegerAttr(options.dmaInputBufferSize)); + }); + setAttrIfEmpty(op, kAccel_dmaOuputAddress, [&]() { + op->setAttr(kAccel_dmaOuputAddress, + rewriter.getI32IntegerAttr(options.dmaOutputAddress)); + }); + setAttrIfEmpty(op, kAccel_dmaOuputBufferSize, [&]() { + op->setAttr(kAccel_dmaOuputBufferSize, + rewriter.getI32IntegerAttr(options.dmaOutputBufferSize)); + }); + setAttrIfEmpty(op, kAccel_acc_on_cpu, [&]() { + op->setAttr(kAccel_acc_on_cpu, rewriter.getBoolAttr(options.flowCpuAcc)); + }); + + // OpcodeMap Attribute + // as string + std::string s0 = options.opcodeMap; + StringRef opcodeMapStr = prepStringOption(s0); + if (opcodeMapStr == "" && !op->getAttr(kAccel_opcode_map_str)) { + op->emitWarning("No opcode map attribute found, skipping"); + filter.replaceLinalgTransformationFilter(rewriter, op); + rewriter.finalizeRootUpdate(op); + return success(); + } + setAttrIfEmpty(op, kAccel_opcode_map_str, [&]() { + op->setAttr(kAccel_opcode_map_str, rewriter.getStringAttr(opcodeMapStr)); + }); + // as attribute + setAttrIfEmpty(op, kAccel_opcode_map, [&]() { + OpcodeMapAttr opcodeMapAttr = + parseAttribute( + op->getAttrOfType(kAccel_opcode_map_str).getValue(), + rewriter.getContext()) + .dyn_cast(); + op->setAttr(kAccel_opcode_map, opcodeMapAttr); + }); + + // OpcodeFlow Attribute + // as string + std::string s1 = options.opcodeFlow; + StringRef opcodeFlowStr = prepStringOption(s1); + setAttrIfEmpty(op, kAccel_opcode_flow_str, [&]() { + op->setAttr(kAccel_opcode_flow_str, + rewriter.getStringAttr(opcodeFlowStr)); + }); + // as attribute + // TODO: handle kAccel_opcode_flow, parse string to validate identifiers + + // InitFlow Attribute + // as string + std::string s2 = options.initFlow; + StringRef initFlowStr = prepStringOption(s2); + setAttrIfEmpty(op, kAccel_init_flow_str, [&]() { + op->setAttr(kAccel_init_flow_str, rewriter.getStringAttr(initFlowStr)); + }); + // as attribute + // TODO: handle kAccel_init_flow, parse string to validate identifiers + + // Create a lambda function for ArrayRef options + auto getArrayAttr = [&](const ArrayRef &inArray) -> ArrayAttr { + SmallVector tmpArray; + for (auto v : inArray) + tmpArray.push_back(rewriter.getI32IntegerAttr(v)); 
+ return rewriter.getArrayAttr(tmpArray); + }; + + // Attributes for tilling and permutation + // TODO: currently the attribute is set correctly but the rewriter pass uses + // what is inside the command line options + + // LoopPermutation Attribute + setAttrIfEmpty(op, kAccel_loop_permutation, [&]() { + op->setAttr(kAccel_loop_permutation, + getArrayAttr(options.loopPermutation)); + }); + + // AccelSizes Attribute + setAttrIfEmpty(op, kAccel_accel_tile_sizes, [&]() { + op->setAttr(kAccel_accel_tile_sizes, getArrayAttr(options.accelSizes)); + }); + + // LoopTiling Attribute + setAttrIfEmpty(op, kAccel_tile_sizes, [&]() { + op->setAttr(kAccel_tile_sizes, getArrayAttr(options.tileSizes)); + }); + + // Accelerator Tile Size Attribute + setAttrIfEmpty(op, kAccel_accel_tile_size, [&]() { + op->setAttr(kAccel_accel_tile_size, + rewriter.getI32IntegerAttr(options.accelSize)); + }); + + // List of operand ids to accumulate on cpu + setAttrIfEmpty(op, kAccel_accumulate_on_cpu, [&]() { + op->setAttr(kAccel_accumulate_on_cpu, getArrayAttr(options.accOnCpu)); + }); + + filter.replaceLinalgTransformationFilter(rewriter, op); + rewriter.finalizeRootUpdate(op); + return success(); + } + +private: + /// LinalgTransformMarker handles special attribute manipulations. + linalg::LinalgTransformationFilter filter; + /// Options for accel transformation + AccelTransformationOptions options; +}; + +/// Function to materialize DMA attributes as constants +static void materializeDMAConstants(PatternRewriter &rewriter, Operation *op, + Location loc, + SmallVector &values) { + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaAddress))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaInputAddress))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaInputBufferSize))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaOuputAddress))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaOuputBufferSize))); +} + +/// Rewrites GenericOp as a series of of accel. +/// Expects the correct attributes to be already set as it +/// does not use options flags and instead, reads the op attributes. +/// TODO: Let this be the case for accelerators with no OPCODES +class LinalgGenericToAccel : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + // Create a function that depending on an integer, adds a value to the + // correct loop body in a nested loop structure. 
+ // ex: if loop_offset = 0, + // then add to the innermost loop body, before `op` + // if loop_offset = 1, + // then add to the second innermost loop body, before terminator the + // `op` + // if loop_offset = 2, + // then add to the third innermost loop body, after the `op` + // if loop_offset = -1, + // then add to the second innermost loop body, before `op` + // + template + static void addOperationToLoopBody(PatternRewriter &rewriter, Location loc, + Operation *op, int loop_offset, + Func lambda) { + + // if loop_offset = 0, then add to the innermost loop body + if (loop_offset == 0) { + // Set insertion point before the operation + // op->emitWarning() << "Offset is 0, calling lambda"; + rewriter.setInsertionPoint(op); + lambda(); + return; + } + + // Get the parent loop operation + scf::ForOp parent_loop_op = op->getParentOfType(); + assert( + parent_loop_op && + "Accessing parent scf::ForOp, but a parent scf::ForOp was not found."); + + switch (loop_offset) { + case -1: { + // op->emitWarning() << "Offset is -1, calling lambda"; + if (parent_loop_op) { + // Set insertion point right before the scf::ForOp + rewriter.setInsertionPoint(parent_loop_op); + } + lambda(); + break; + } + case 1: { + if (parent_loop_op) { + // op->emitWarning() << "Offset is 1, calling lambda"; + // Set insertion point before the terminator of parent loop operation + rewriter.setInsertionPoint(parent_loop_op->getBlock()->getTerminator()); + } + lambda(); + break; + } + default: { + // if not -1, 0, 1, we have to recursively call this function with parent + // loop operation as the operation and loop_offset -1 if positive, or +1 + // if negative + addOperationToLoopBody( + rewriter, loc, parent_loop_op, + loop_offset > 0 ? loop_offset - 1 : loop_offset + 1, lambda); + } + } + return; + } + + // Function to parse accel_opcode_flow_str and generate a vector of where each + // operation should be placed + // The attribute opcode flow string has the following format: + // opcode_flow ::= opcode_flow_expr + // opcode_flow_expr ::= `(` opcode_flow_expr `)` + // | `(` opcode_flow_expr opcode_id `)` + // | `(` opcode_id `opcode_flow_expr )` + // | opcode_id + // + + // Examples and outputs: + // accel_opcode_flow_str = "(s0 (s1 s2 r2))" + // [(-1,[s0]), (0,[s1,s2,r2])] + // + // accel_opcode_flow_str = "(s0 (s1 s2) r2)" + // [(-1,[s0]), (0,[s1,s2]), (1,[r2])] + // + // accel_opcode_flow_str = "((s0 s1 s2) r2)" + // [(0,[s0,s1,s2 ]), (1,[r2])] + static LogicalResult parseOpcodeFlowStr( + Operation *op, SmallVectorImpl &loop_offsets, + SmallVectorImpl &opcodes_strs, + SmallVectorImpl> &lists_of_opcode_ids) { + // op->emitWarning() << "Parsing opcode flow str"; + std::string opcode_flow_str = + op->getAttrOfType(kAccel_opcode_flow_str).str(); + // op->emitWarning() << opcode_flow_str; + + assert(!opcode_flow_str.empty() && + "accel_opcode_flow_str is empty, but it should not be."); + + int n_left_paren = 0; + int n_right_paren = 0; + for (char c : opcode_flow_str) { + if (c == '(') + n_left_paren++; + if (c == ')') + n_right_paren++; + } + assert(n_left_paren == n_right_paren && + "accel_opcode_flow_str has mismatched parentheses"); + + // get substring between parentheses + int c_paren = 0; + for (size_t i = 0; i < opcode_flow_str.size(); i++) { + if (opcode_flow_str[i] == '(' || opcode_flow_str[i] == ')') { + size_t j = i + 1; + while ((opcode_flow_str[j] != ')') && (opcode_flow_str[j] != '(')) { + j++; + } + if (opcode_flow_str[i] == '(' || opcode_flow_str[i] == ')') { + c_paren++; + + // Only print if still inside 
parentheses + if (c_paren < n_left_paren + n_right_paren) { + // if (j != opcode_flow_str.size()) { + std::string substring = opcode_flow_str.substr(i + 1, j - i - 1); + + // Only push back if the substring is not empty + if (!substring.empty()) { + loop_offsets.push_back(c_paren - n_left_paren); + opcodes_strs.push_back(substring); + // op->emitWarning() << substring << " " << c_paren - + // n_left_paren; + } + } + } + } + } + + // The strings in opcodes_strs represent a string of multiple opcodes + // separated by spaces. We need to split them into individual opcodes. + for (auto &&opcode_str : opcodes_strs) { + SmallVector splitted_opcodes; + StringRef opcode_id_sr = opcode_str; + // First trim leading and trailing spaces + opcode_id_sr = opcode_id_sr.trim(); + // Finally split the string into individual opcodes + opcode_id_sr.split(splitted_opcodes, " "); + + // push back the vector of opcode ids + lists_of_opcode_ids.push_back(splitted_opcodes); + + // print the opcodes + // for (auto &&opcode_id_split : splitted_opcodes) { + // op->emitWarning() << "Opcode id: " << opcode_id_split<< "!"; + // } + } + + assert(loop_offsets.size() == lists_of_opcode_ids.size() && + "loop_offsets and lists_of_opcode_ids have different sizes"); + + // Print the loop offsets and opcode ids + // op->emitWarning() << "Opcode flow str parsed successfully!" + // << "\n\tloop_offsets: " << loop_offsets + // << "\n\topcodes_strs: " << opcodes_strs + // << "\n\tlists_of_opcode_ids_size: " << + // lists_of_opcode_ids.size() + // << "\n\tlists_of_opcode_ids: " << lists_of_opcode_ids; + + return success(); + } + + static void printOpcodesInMap( + Operation *op, SmallVectorImpl &loop_offsets, + SmallVectorImpl &opcodes_strs, + SmallVectorImpl> &lists_of_opcode_ids) { + + // Get the opcodeMap from operation + auto opcodeMap = + op->getAttrOfType(kAccel_opcode_map).getValue(); + llvm::errs() << "OpcodeMap: " << opcodeMap << "\n"; + op->emitWarning() << "Number of opcodes in the map: " + << opcodeMap.getNumOpcodes() << "!"; + + // Print value associated with opcode in the opcodeMap attribute + // Use OpcodeList OpcodeMap::getOpcodeList(StringRef key) + for (auto &&list_of_opcode_ids : lists_of_opcode_ids) { + for (auto &&opcode_id : list_of_opcode_ids) { + // Print id and position of opcode in the map + op->emitWarning() << "Opcode id: " << opcode_id << " at position " + << opcodeMap.getOpcodeListPosition(opcode_id) << "!"; + assert(opcodeMap.getOpcodeListPosition(opcode_id) != -1 && + "Opcode id not found in the map!"); + OpcodeList opcodeList = opcodeMap.getOpcodeList(opcode_id); + // Print number of opcodes in the list + op->emitWarning() << "Number of opcodes in the list: " + << opcodeList.getNumActions() << "!"; + // Print id and dump of each opcode in the list + llvm::errs() << "Opcode id: " << opcode_id << " " + << "OpcodeListDump: " << opcodeList << "\n"; + + for (auto &&action : opcodeList.getActions()) { + // Switch case on the kind of action + switch (action.getKind()) { + case OpcodeExprKind::Send: { + auto id = action.cast().getId(); + llvm::errs() << "Send action. " + << "id: " << id << "\n"; + break; + } + case OpcodeExprKind::Recv: { + llvm::errs() << "Recv action. "; + break; + } + case OpcodeExprKind::SendLiteral: { + llvm::errs() << "SendLiteral action. "; + break; + } + case OpcodeExprKind::SendDim: { + llvm::errs() << "SendDim action. "; + break; + } + case OpcodeExprKind::SendIdx: { + llvm::errs() << "SendIdx action. 
"; + break; + } + default: { + llvm_unreachable("Unknown action."); + } + } + llvm::errs() << "action dump: " << action << "\n"; + } + } + } + } + + /// Add accel.send and accel.recv operations to the function based on the + /// loop_offsets and lists_of_opcode_ids paired with the opcodeMap attribute. + static void + addAccelOps(Operation *op, PatternRewriter &rewriter, + SmallVectorImpl &loop_offsets, + SmallVectorImpl> &lists_of_opcode_ids) { + + Location loc = op->getLoc(); + // op->emitWarning() << "Adding accel.send and accel.recv operations..."; + auto opcodeMap = + op->getAttrOfType(kAccel_opcode_map).getValue(); + // llvm::errs() << "OpcodeMap: " << opcodeMap << "\n"; + // op->emitWarning() << "Number of opcodes in the map: " + // << opcodeMap.getNumOpcodes() << "!"; + + std::vector>> zipped; + std::transform(loop_offsets.begin(), loop_offsets.end(), + lists_of_opcode_ids.begin(), std::back_inserter(zipped), + [](int a, SmallVector b) { + return std::make_pair(a, b); + }); + + for (auto &&pair : zipped) { + int loop_offset = pair.first; + SmallVector list_of_opcode_ids = pair.second; + for (auto &&opcode_id : list_of_opcode_ids) { + // Print id and position of opcode in the map + // op->emitWarning() << "Opcode id: " << opcode_id + // << " in map at position " + // << opcodeMap.getOpcodeListPosition(opcode_id) << + // "!"; + assert(opcodeMap.getOpcodeListPosition(opcode_id) != -1 && + "Opcode id not found in the map!"); + OpcodeList opcodeList = opcodeMap.getOpcodeList(opcode_id); + // Print number of opcodes in the list + // op->emitWarning() << "Number of opcodes in the list: " + // << opcodeList.getNumActions() << "!"; + // Print id and dump of each opcode in the list + // llvm::errs() << "Opcode id: " << opcode_id << " " + // << "OpcodeListDump: " << opcodeList << "\n"; + + Value initialOffset = nullptr; + + addOperationToLoopBody(rewriter, op->getLoc(), op, loop_offset, [&]() { + // Create the value to track the offset of the data + Value cteZero = rewriter.create( + loc, IntegerAttr::get(rewriter.getI32Type(), 0)); + initialOffset = cteZero; + }); + + // Insert the actions in the IR + for (auto &&action : opcodeList.getActions()) { + // Switch case on the kind of action + switch (action.getKind()) { + case OpcodeExprKind::Send: { + auto id = action.cast().getId(); + + Value operand = op->getOperands()[id]; + addOperationToLoopBody( + rewriter, op->getLoc(), op, loop_offset, [&]() { + // Operand is a subview of the original memref, we need to + // move this subview to correct loop_offset. We do this by + // creating a new memref.subview with the same input + // parameters. And replacing the operand with this new + // subview. + auto subViewOp = operand.getDefiningOp(); + if (!subViewOp) { + // Simply create a send operation with the operand + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), operand, initialOffset); + return; + } + + // // TODO: Check if subview has been replaced + // // Only create the replacement if the subview has not been + // // moved yet. To verify this, check if the parent of the + // // subview is the same as the parent of op. 
+ // if (subViewOp->getParentOp() == op->getParentOp()) { + // op->emitWarning() << "Subview has already been moved!"; + // initialOffset = rewriter.create( + // loc, rewriter.getI32Type(), subViewOp, + // initialOffset); + // } else { + // op->emitError() << "Subview has not been moved yet!"; + // return; + // } + + // Value newSubView = rewriter.create( + // loc, subViewOp.getType(), subViewOp.source(), + // subViewOp.static_offsets(), subViewOp.static_sizes(), + // subViewOp.static_strides()); + Value newSubView = rewriter.create( + loc, subViewOp.getType(), subViewOp.source(), + subViewOp.offsets(), subViewOp.sizes(), + subViewOp.strides(), subViewOp.static_offsets(), + subViewOp.static_sizes(), subViewOp.static_strides()); + + // Iterate on the operands, get defining op, if it is a + // constantop then move it before the newsubview + for (auto &&operand : subViewOp.getOperands()) { + Operation *defOp = operand.getDefiningOp(); + if (defOp && isa(defOp)) { + defOp->moveBefore(newSubView.getDefiningOp()); + } + } + rewriter.replaceOp(subViewOp, newSubView); + + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), newSubView, initialOffset); + }); + break; + } + case OpcodeExprKind::Recv: { + auto id = action.cast().getId(); + + Value operand = op->getOperands()[id]; + addOperationToLoopBody( + rewriter, op->getLoc(), op, loop_offset, [&]() { + auto subViewOp = operand.getDefiningOp(); + if (!subViewOp) { + // Simply create a Recv operation with the operand + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), operand, initialOffset); + return; + } + + // TODO: Check if subview has been replaced + + Value newSubView = rewriter.create( + loc, subViewOp.getType(), subViewOp.source(), + subViewOp.offsets(), subViewOp.sizes(), + subViewOp.strides(), subViewOp.static_offsets(), + subViewOp.static_sizes(), subViewOp.static_strides()); + + for (auto &&operand : subViewOp.getOperands()) { + Operation *defOp = operand.getDefiningOp(); + if (defOp && isa(defOp)) { + defOp->moveBefore(newSubView.getDefiningOp()); + } + } + rewriter.replaceOp(subViewOp, newSubView); + + // Generate accumulation on CPU if needed. + bool acc_on_cpu = false; + if (op->getAttrOfType(kAccel_acc_on_cpu).getValue()) + acc_on_cpu = true; + else { + // Set acc_on_cpu true if the operand is in the list of + // operands to be accumulated. 
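+                    // For example, a kAccel_accumulate_on_cpu attribute of
+                    // [2] requests CPU accumulation only for operand #2
+                    // (typically the output tile): the loop below sets
+                    // acc_on_cpu when this Recv operand's id appears in
+                    // that list.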
+ for (auto &&operand : op->getAttrOfType( + kAccel_accumulate_on_cpu)) { + if (operand.cast().getInt() == id) { + acc_on_cpu = true; + break; + } + } + } + if (acc_on_cpu) { + MemRefType sVmrType = + newSubView.getType().cast(); + + SmallVector shape; + auto accelSizes = op->getAttrOfType( + kAccel_accel_tile_sizes); + + // TODO: get shape from SubViewOp creating the subview + auto loopPerm = op->getAttrOfType( + kAccel_loop_permutation); + int index[3]; + for (unsigned i = 0; i < 3; i++) { + index[loopPerm[i].cast().getInt()]=i; + } + // SmallVector rootTileSizes(options.tileSizes.begin(), + // options.tileSizes.begin() + + // rootOp.getNumLoops()); + // if access sizes bigger than 0, use them + if (accelSizes.size() > 0) { + // TODO use begin and end iterator + for (unsigned i = 0; i < sVmrType.getRank(); i++) { + shape.push_back(accelSizes[index[i]].cast().getInt()); + } + } else { + for (unsigned i = 0; i < sVmrType.getRank(); i++) { + auto accelSize = op->getAttrOfType( + kAccel_accel_tile_size); + + // TODO: Support multi-dimensions + shape.push_back(accelSize.getInt()); + } + } + // Transform SmallVector in ArrayRef + ArrayRef shapeRef(shape); + MemRefType mrType = + MemRefType::get(shapeRef, sVmrType.getElementType()); + Value tMr = rewriter.create(loc, mrType); + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), tMr, initialOffset); + + // Create affine maps and attributes for CPU accumulation + MemRefType tmpMrType = tMr.getType().cast(); + unsigned rank = tmpMrType.getRank(); + SmallVector indexingMaps( + /*1 inputs, 1 (inplace) output*/ 2, + rewriter.getMultiDimIdentityMap(rank)); + auto loopsAttr = SmallVector( + rank, getParallelIteratorTypeName()); + + rewriter.create( + loc, + /*resultTypes=*/TypeRange(), + /*inputs=*/tMr, + /*outputs=*/newSubView, + /*indexingMaps=*/indexingMaps, + /*iteratorTypes=*/loopsAttr, + /*bodyBuilder=*/ + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, + added); + }); + } else { + // initialOffset = rewriter.create( + // loc, rewriter.getI32Type(), operand, + // initialOffset); + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), newSubView, initialOffset); + } + }); + break; + } + case OpcodeExprKind::SendLiteral: { + auto value = action.cast().getValue(); + + Value literal = rewriter.create( + loc, IntegerAttr::get(rewriter.getI32Type(), value)); + addOperationToLoopBody( + rewriter, op->getLoc(), op, loop_offset, [&]() { + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), literal, initialOffset); + }); + break; + } + case OpcodeExprKind::SendDim: { + llvm::errs() << "SendDim action. "; + llvm_unreachable("No support for SendDim yet."); + break; + } + case OpcodeExprKind::SendIdx: { + llvm::errs() << "No support for SendIdx yet. 
"; + break; + } + default: + llvm_unreachable("Unknown action."); + } + } + } + } + } + + LogicalResult matchAndRewrite(linalg::GenericOp op, + PatternRewriter &rewriter) const override { + + Location loc = op->getLoc(); + + // Get location before first operation inside funcOp + FuncOp funcOp = op->getParentOfType(); + // Location funcFrontLoc = funcOp.front().front().getLoc(); + + rewriter.setInsertionPointToStart(&funcOp.front()); + Location funcFrontLoc = rewriter.getInsertionPoint()->getLoc(); + + SmallVector valuesForInitDMA; + materializeDMAConstants(rewriter, op, funcFrontLoc, valuesForInitDMA); + + // TODO check if such operation already exists for the same DMA address + // Create the accel.init_dma operation + rewriter.create(funcFrontLoc, valuesForInitDMA[0], + valuesForInitDMA[1], valuesForInitDMA[2], + valuesForInitDMA[3], valuesForInitDMA[4]); + + SmallVector loop_offsets; + SmallVector opcodes_strs; + SmallVector, 4> lists_of_opcode_ids; + parseOpcodeFlowStr(op, loop_offsets, opcodes_strs, lists_of_opcode_ids); + + // printOpcodesInMap(op, loop_offsets, opcodes_strs, lists_of_opcode_ids); + addAccelOps(op, rewriter, loop_offsets, lists_of_opcode_ids); + + // for (auto && l: loop_offsets) { + // addOperationToLoopBody(rewriter, loc, op, l, [&]() { + // op->emitWarning() << "Creating testCte"; + // // TODO: Create correct accel operation + // Value testCte = rewriter.create( + // loc, IntegerAttr::get(rewriter.getI32Type(), 7777+l)); + // }); + // } + + // rewriter.setInsertionPoint(op); + + // Value cteZero = rewriter.create( + // loc, IntegerAttr::get(rewriter.getI32Type(), 0)); + // Value initialOffset = cteZero; + + // for (Value operand : op.inputs()) { + // initialOffset = rewriter.create(loc, + // rewriter.getI32Type(), + // operand, initialOffset); + // } + + // initialOffset = cteZero; + // for (Value operand : op.outputs()) { + // if (op->getAttrOfType(kAccel_acc_on_cpu).getValue()) { + // MemRefType mrType = operand.getType().cast(); + // Value tMr = rewriter.create(loc, mrType); + // rewriter.create( + // loc, rewriter.getI32Type(), tMr, + // initialOffset); // TODO: Initial offset? Multiple outputs? 
+ + // // Create affine maps and attributes for CPU accumulation + // MemRefType tmpMrType = tMr.getType().cast(); + // unsigned rank = tmpMrType.getRank(); + // SmallVector indexingMaps( + // /*1 inputs, 1 (inplace) output*/ 2, + // rewriter.getMultiDimIdentityMap(rank)); + // auto loopsAttr = + // SmallVector(rank, getParallelIteratorTypeName()); + + // rewriter.create( + // loc, + // /*resultTypes=*/TypeRange(), + // /*inputs=*/tMr, + // /*outputs=*/operand, + // /*indexingMaps=*/indexingMaps, + // /*iteratorTypes=*/loopsAttr, + // /*bodyBuilder=*/ + // [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange + // args) { + // Value added = + // nestedBuilder.create(loc, args[0], args[1]); + // nestedBuilder.create(nestedLoc, added); + // }); + // } else { + // initialOffset = rewriter.create( + // loc, rewriter.getI32Type(), operand, initialOffset); + // } + // } + rewriter.eraseOp(op); + + return success(); + } +}; + +void mlir::populateLinalgGenericToAccelConversionPatternsWithOptions( + RewritePatternSet &patterns, const AccelTransformationOptions &options) { + MLIRContext *ctx = patterns.getContext(); + // This populate patterns that implement the following FSM modifying + // kLinalgTransformMarker GENERALIZE -> ANNOTATE -> INTERCHANGE -> MEM(TILE) + // L3(TILE) -> L2(TILE) -> L1(TILE) -> ACCEL + patterns.add( + ctx, + linalg::LinalgTransformationFilter(StringAttr::get(ctx, "ANNOTATE"), + StringAttr::get(ctx, "INTERCHANGE")), + options); + populateCommonLinalgTransformationPatterns(patterns, options); +} + +void mlir::populateLinalgGenericToAccelConversionPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +namespace { +struct ConvertLinalgGenericToAccelPass + : public ConvertLinalgGenericToAccelBase { + ConvertLinalgGenericToAccelPass() = default; + + /// Constructor to build this pass using user defined options + /// Not used when the pass is created from commandline, helpful for creating + /// this pass in code + ConvertLinalgGenericToAccelPass(const AccelTransformationOptions &options) { + this->accelSize = options.accelSize; + this->accelSizes = options.accelSizes; + this->dmaAddress = options.dmaAddress; + this->dmaInputAddress = options.dmaInputAddress; + this->dmaInputBufferSize = options.dmaInputBufferSize; + this->dmaOutputAddress = options.dmaOutputAddress; + this->dmaOutputBufferSize = options.dmaOutputAddress; + this->accOnCpu = options.accOnCpu; + this->flowCpuAcc = options.flowCpuAcc; // TODO: will be deprecated + this->numberOfCaches = options.numberOfCaches; + this->cacheSizes = options.cacheSizes; + this->tileSizes = options.tileSizes; + this->elementSize = options.elementSize; + this->loopPermutation = options.loopPermutation; + this->anchorFuncName = options.anchorFuncName; + this->anchorOpName = options.anchorOpName; + this->anchorFilterName = options.anchorFilterName; + this->opcodeMap = options.opcodeMap; + this->initFlow = options.initFlow; + this->opcodeFlow = options.opcodeFlow; + } + + void runOnOperation() override; + + void setOptions(AccelTransformationOptions &options) { + options.accelSize = this->accelSize; + options.accelSizes = this->accelSizes; + options.dmaAddress = this->dmaAddress; + options.dmaInputAddress = this->dmaInputAddress; + options.dmaInputBufferSize = this->dmaInputBufferSize; + options.dmaOutputAddress = this->dmaOutputAddress; + options.dmaOutputBufferSize = this->dmaOutputBufferSize; + options.accOnCpu = this->accOnCpu; + options.flowCpuAcc = this->flowCpuAcc; // TODO: will be deprecated + 
options.numberOfCaches = this->numberOfCaches; + options.cacheSizes = this->cacheSizes; + options.tileSizes = this->tileSizes; + options.elementSize = this->elementSize; + options.loopPermutation = this->loopPermutation; + options.anchorFuncName = this->anchorFuncName; + options.anchorOpName = this->anchorOpName; + options.anchorFilterName = this->anchorFilterName; + options.opcodeMap = this->opcodeMap; + options.initFlow = this->initFlow; + options.opcodeFlow = this->opcodeFlow; + } +}; +} // namespace + +/// The conversion takes the following steps: +/// 1. Marks anchor ops with the "generalize" attribute +/// 2. Generalizes the marked ops, marking the Ops with the "ACCEL" attribute +/// 3. Annotate attributes to the marked ops +/// 4. Convert the marked ops to the accel dialect +void ConvertLinalgGenericToAccelPass::runOnOperation() { + + AccelTransformationOptions options; + setOptions(options); + + auto module = getOperation(); + MLIRContext *ctx = &getContext(); + + // 1. Marks anchor ops with the "GENERALIZE" or "ANNOTATE" attribute + module.walk([&](FuncOp functionOp) { + if (!anchorFuncName.empty() && anchorFuncName != functionOp.getName()) + return; + + functionOp.walk([&](linalg::LinalgOp op) { + if (!anchorFilterName.empty()) { + // Skip this op if the LinalgOp has kAccelTransformMarker that is not + // equal to anchorFilterName + if (op->getAttr(kAccelTransformMarker) != + StringAttr::get(ctx, anchorFilterName)) { + return; + } + } + + if ((op->getAttr(kLinalgTransformMarker) != + StringAttr::get(ctx, "ACCELERATE"))) { + if ((anchorOpName != op->getName().getStringRef())) + return; + } + + if (isa(op)) { + op->setAttr(kLinalgTransformMarker, + StringAttr::get(&getContext(), "ANNOTATE")); + } else { + op->setAttr(kLinalgTransformMarker, + StringAttr::get(&getContext(), "GENERALIZE")); + } + }); + }); + + // 2. Generalizes the marked ops, marking the Ops with the next attribute in + // the FSM. Uses a nested pass manager. + PassManager pm(module.getContext()); + linalg::LinalgTransformationFilter f(StringAttr::get(ctx, "GENERALIZE"), + StringAttr::get(ctx, "ANNOTATE")); + pm.addNestedPass( + mlir::createLinalgStrategyGeneralizePass(anchorOpName, f)); + + if (failed(pm.run(module))) + signalPassFailure(); + + // Using rewrite patterns + // 3. Annotate attributes to the marked ops + // 4. 
Convert the marked ops to the accel dialect + RewritePatternSet patterns(&getContext()); + populateLinalgGenericToAccelConversionPatternsWithOptions(patterns, options); + + ConversionTarget target(getContext()); + // clang-format off + target.addLegalDialect(); + // clang-format on + target.addDynamicallyLegalOp( + [&](linalg::GenericOp op) -> bool { + MLIRContext *ctx = &getContext(); + SmallVector markers = { + "GENERALIZE", "ANNOTATE", "INTERCHANGE", "MEM", "L3", "L2", "L1"}; + + auto aMarkerMatchesAttr = [&](const Attribute &attr) -> bool { + // Acts like an OR operation, returns true in the first match + for (auto marker : markers) { + // TODO: Could be made more efficient by casting attr to StringAttr + if (StringAttr::get(ctx, marker) == attr) + return true; + } + return false; + }; + + return !(aMarkerMatchesAttr(op->getAttr(kLinalgTransformMarker))); + }); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); + + RewritePatternSet patterns2(&getContext()); + populateLinalgGenericToAccelConversionPatterns(patterns2); + target.addDynamicallyLegalOp( + [&](linalg::GenericOp op) -> bool { + auto marker = StringAttr::get(&getContext(), "GENACCEL"); + return !((op->getAttr(kLinalgTransformMarker) == marker)); + }); + if (failed(applyPartialConversion(module, target, std::move(patterns2)))) + signalPassFailure(); +} + +std::unique_ptr> +mlir::createConvertLinalgGenericToAccelPass() { + return std::make_unique(); +} + +// std::unique_ptr> +// mlir::createConvertLinalgGenericToAccelPass( +// const AccelTransformationOptions &options) { +// return std::make_unique(options); +// } diff --git a/lib/Conversion/PassDetail.h b/lib/Conversion/PassDetail.h index 36ce500..de20815 100644 --- a/lib/Conversion/PassDetail.h +++ b/lib/Conversion/PassDetail.h @@ -29,6 +29,14 @@ namespace scf { class SCFDialect; } // end namespace scf +namespace memref { +class MemRefDialect; +} // namespace memref + +namespace LLVM { +class LLVMDialect; +} // namespace LLVM + #define GEN_PASS_CLASSES #include "soda/Conversion/Passes.h.inc" diff --git a/lib/Dialect/Accel/CMakeLists.txt b/lib/Dialect/Accel/CMakeLists.txt new file mode 100644 index 0000000..f33061b --- /dev/null +++ b/lib/Dialect/Accel/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/lib/Dialect/Accel/IR/AccelDialect.cpp b/lib/Dialect/Accel/IR/AccelDialect.cpp new file mode 100644 index 0000000..02703da --- /dev/null +++ b/lib/Dialect/Accel/IR/AccelDialect.cpp @@ -0,0 +1,36 @@ +//===- AccelDialect.cpp - MLIR dialect for Accel implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Transforms/InliningUtils.h" +#include "soda/Dialect/Accel/IR/Accel.h" + +using namespace mlir; +using namespace mlir::accel; + +#include "soda/Dialect/Accel/IR/AccelOpsDialect.cpp.inc" + +namespace { +/// This class defines the interface for handling inlining with accel +/// operations. +struct AccelInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + /// All operations within accel ops can be inlined. 
+ bool isLegalToInline(Operation *, Region *, bool, IRMapping &) const final { + return true; + } +}; +} // namespace + +void mlir::accel::AccelDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "soda/Dialect/Accel/IR/AccelOps.cpp.inc" + >(); + addInterfaces(); +} diff --git a/lib/Dialect/Accel/IR/AccelOps.cpp b/lib/Dialect/Accel/IR/AccelOps.cpp new file mode 100644 index 0000000..f92fb1c --- /dev/null +++ b/lib/Dialect/Accel/IR/AccelOps.cpp @@ -0,0 +1,20 @@ +//===- AccelOps.cpp - MLIR operations for accel implementation ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Builders.h" +#include "soda/Dialect/Accel/IR/Accel.h" + +using namespace mlir; +using namespace mlir::accel; + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "soda/Dialect/Accel/IR/AccelOps.cpp.inc" diff --git a/lib/Dialect/Accel/IR/CMakeLists.txt b/lib/Dialect/Accel/IR/CMakeLists.txt new file mode 100644 index 0000000..5576b92 --- /dev/null +++ b/lib/Dialect/Accel/IR/CMakeLists.txt @@ -0,0 +1,14 @@ +add_mlir_dialect_library(SODAAccelDialect + AccelOps.cpp + AccelDialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/soda/Dialect/Accel + + DEPENDS + MLIRAccelOpsIncGen + + LINK_LIBS PUBLIC + MLIRDialect + MLIRIR +) diff --git a/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp b/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp index be482e9..2c950ef 100644 --- a/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp +++ b/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/Debug.h" #include - #define DEBUG_TYPE "soda-affine-data-copy-gen" using namespace mlir; @@ -73,7 +72,7 @@ struct AffineDataCopyGen } void runOnOperation() override; - LogicalResult runOnBlock(Block *block, DenseSet ©Nests); + void runOnBlock(Block *block, DenseSet ©Nests); // Constant zero index to avoid too many duplicates. Value zeroIndex = nullptr; @@ -98,10 +97,10 @@ mlir::soda::createAffineDataCopyGenPass( /// ranges: each range is either a sequence of one or more operations starting /// and ending with an affine load or store op, or just an affine.forop (which /// could have other affine for op's nested within). -LogicalResult AffineDataCopyGen::runOnBlock(Block *block, - DenseSet ©Nests) { +void AffineDataCopyGen::runOnBlock(Block *block, + DenseSet ©Nests) { if (block->empty()) - return success(); + return; uint64_t fastMemCapacityBytes = fastMemoryCapacity != std::numeric_limits::max() @@ -111,7 +110,7 @@ LogicalResult AffineDataCopyGen::runOnBlock(Block *block, fastMemorySpace, tagMemorySpace, fastMemCapacityBytes}; - // Every affine.forop in the block starts and ends a block range for copying; + // Every affine.for op in the block starts and ends a block range for copying; // in addition, a contiguous sequence of operations starting with a // load/store op but not including any copy nests themselves is also // identified as a copy block range. 
Straightline code (a contiguous chunk of @@ -160,7 +159,7 @@ LogicalResult AffineDataCopyGen::runOnBlock(Block *block, if (recurseInner) { // We'll recurse and do the copies at an inner level for 'forInst'. // Recurse onto the body of this loop. - (void)runOnBlock(forOp.getBody(), copyNests); + runOnBlock(forOp.getBody(), copyNests); } else { // We have enough capacity, i.e., copies will be computed for the // portion of the block until 'it', and for 'it', which is 'forOp'. Note @@ -198,8 +197,6 @@ LogicalResult AffineDataCopyGen::runOnBlock(Block *block, /*end=*/std::prev(block->end()), copyOptions, /*filterMemRef=*/std::nullopt, copyNests); } - - return success(); } void AffineDataCopyGen::runOnOperation() { @@ -215,7 +212,7 @@ void AffineDataCopyGen::runOnOperation() { copyNests.clear(); for (auto &block : f) - (void)runOnBlock(&block, copyNests); + runOnBlock(&block, copyNests); // Promote any single iteration loops in the copy nests and collect // load/stores to simplify. @@ -237,5 +234,6 @@ void AffineDataCopyGen::runOnOperation() { AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); - (void)applyOpPatternsAndFold(copyOps, frozenPatterns, /*strict=*/true); + (void)applyOpPatternsAndFold(copyOps, frozenPatterns, + GreedyRewriteStrictness::ExistingAndNewOps); } diff --git a/lib/Dialect/CMakeLists.txt b/lib/Dialect/CMakeLists.txt index bbe6311..a1af34b 100644 --- a/lib/Dialect/CMakeLists.txt +++ b/lib/Dialect/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(SODA) add_subdirectory(SNN) add_subdirectory(Linalg) add_subdirectory(Affine) +add_subdirectory(Accel) add_subdirectory(Transform) \ No newline at end of file diff --git a/lib/Dialect/Linalg/Transforms/Tiling.cpp b/lib/Dialect/Linalg/Transforms/Tiling.cpp index be8fd49..cf22ef8 100644 --- a/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -58,7 +58,8 @@ parseTilingString(ModuleOp &module, MLIRContext *context, transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{[""]} in %arg1 - %1, %loops: = transform.structured.tile %0 [] + %1, %loops: = transform.structured.tile %0 [] : + (!pdl.operation) -> () } )MLIR"; @@ -81,6 +82,18 @@ parseTilingString(ModuleOp &module, MLIRContext *context, std::string tileNDimsStr = std::to_string(tileSizes.size()); str = str.replace(str.find(""), 11, tileNDimsStr); + // replace with the correct number of !pdl.operation,... 
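+  // For example (illustrative values only): with the matched op
+  // "linalg.matmul" and tileSizes = {32, 32}, two loops are generated, so
+  // three result types are needed and the parsed script roughly becomes:
+  //   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+  //   %1, %loops:2 = transform.structured.tile %0 [32, 32] :
+  //       (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)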
+ std::string pdlOutTypeStr = ""; + // the number of types is given by the number of for loops + 1 + for (size_t i = 0; i < tileSizes.size() + 1; i++) { + pdlOutTypeStr += "!pdl.operation"; + if (i != tileSizes.size()) { + pdlOutTypeStr += ", "; + } + } + // perform string replacement + str = str.replace(str.find(""), 16, pdlOutTypeStr); + // Parse the string return parseSourceString(str, module, context); } @@ -119,7 +132,8 @@ parseTilingString(ModuleOp &module, MLIRContext *context, // // todo: create tile op // // SmallVector tileSizes = {4, 4}; // // auto tiletoScfForOp = -// // b.create(loc, matchOp.getResult(), tileSizes); +// // b.create(loc, matchOp.getResult(), +// tileSizes); // // auto forLoops = tiletoScfForOp.getLoops(); // // auto tiledOpH = tiletoScfForOp.getTiledLinalgOp(); diff --git a/lib/Dialect/SODA/Transforms/KernelGeneration.cpp b/lib/Dialect/SODA/Transforms/KernelGeneration.cpp index e850e28..9d39e92 100644 --- a/lib/Dialect/SODA/Transforms/KernelGeneration.cpp +++ b/lib/Dialect/SODA/Transforms/KernelGeneration.cpp @@ -17,9 +17,9 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/FunctionInterfaces.h" +#include "mlir/IR/IRMapping.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/RegionUtils.h" @@ -72,7 +72,7 @@ void SodaKernelGenerationPass::runOnOperation() { return signalPassFailure(); } - BlockAndValueMapping map; + IRMapping map; sodaOp.getRegion().cloneInto(&(mop.getRegion()), map); sodaOp.erase(); diff --git a/lib/Dialect/SODA/Transforms/KernelOutlining.cpp b/lib/Dialect/SODA/Transforms/KernelOutlining.cpp index 867d4bd..a1f606e 100644 --- a/lib/Dialect/SODA/Transforms/KernelOutlining.cpp +++ b/lib/Dialect/SODA/Transforms/KernelOutlining.cpp @@ -16,8 +16,8 @@ #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/IRMapping.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Parser/Parser.h" #include "mlir/Support/LLVM.h" @@ -98,7 +98,7 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(soda::LaunchOp launchOp) { } // Insert operations so that the defs get cloned before uses. - BlockAndValueMapping map; + IRMapping map; OpBuilder builder(launchOpBody); for (Operation *op : toBeSunk) { Operation *clonedOp = builder.clone(*op, map); @@ -137,7 +137,7 @@ outlineKernelFuncImpl(soda::LaunchOp launchOp, StringRef kernelFnName, auto outlinedFunc = builder.create(loc, kernelFnName, type); outlinedFunc->setAttr(soda::SODADialect::getKernelFuncAttrName(), builder.getUnitAttr()); - BlockAndValueMapping map; + IRMapping map; // Map the arguments corresponding to the launch parameter like blockIdx, // threadIdx, etc. 
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt new file mode 100644 index 0000000..6e8e666 --- /dev/null +++ b/lib/ExecutionEngine/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(axi) \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/AxiUtils.cpp b/lib/ExecutionEngine/axi/AxiUtils.cpp new file mode 100644 index 0000000..a8a4135 --- /dev/null +++ b/lib/ExecutionEngine/axi/AxiUtils.cpp @@ -0,0 +1,200 @@ +//===- AxiUtils.cpp - AXI4MLIR implementation ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements wrapper AXI4MLIR library calls. These are the calls +// visible to the MLIR. +// +//===----------------------------------------------------------------------===// + +#include "soda/ExecutionEngine/axi/AxiUtils.h" + +#include "soda/ExecutionEngine/axi/api_v1.h" + +struct dma myDMA; +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +extern "C" void dma_init(unsigned int dma_address, + unsigned int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl; + std::cout << "\t" << dma_address << std::endl; + std::cout << "\t" << dma_input_address << std::endl; + std::cout << "\t" << dma_input_buffer_size << std::endl; + std::cout << "\t" << dma_output_address << std::endl; + std::cout << "\t" << dma_output_buffer_size << std::endl;); + + myDMA.dma_init(dma_address, dma_input_address, dma_input_buffer_size, + dma_output_address, dma_output_buffer_size); + return; +} + +// V2 implementation +// extern "C" void dma_init(unsigned int dma_address, +// unsigned int dma_input_address, +// unsigned int dma_input_buffer_size, unsigned int +// isize, unsigned int dma_output_address, unsigned int +// dma_output_buffer_size, unsigned int osize) { +// D(std::cout << "Called: " << __func__ << " not mock version" << std::endl; +// std::cout << "\t" << dma_address << std::endl; +// std::cout << "\t" << dma_input_address << std::endl; +// std::cout << "\t" << dma_input_buffer_size << std::endl; +// std::cout << "\t" << isize << std::endl; +// std::cout << "\t" << dma_output_address << std::endl; +// std::cout << "\t" << dma_output_buffer_size << std::endl; +// std::cout << "\t" << osize << std::endl;); + +// myDMA.dma_init(dma_address, dma_input_address, dma_input_buffer_size, +// isize, +// dma_output_address, dma_output_buffer_size, osize); +// return; +// } + +extern "C" void dma_free() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_free(); +} + +extern "C" unsigned int *dma_get_inbuffer() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_get_inbuffer(); +} + +extern "C" unsigned int *dma_get_outbuffer() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_get_outbuffer(); +} + +// V2 implementation +// extern "C" char *dma_get_inbuffer() { +// D(std::cout << "Called: " << __func__ << " not mock version" << +// std::endl;); return 
myDMA.dma_get_inbuffer(); +// } + +// extern "C" char *dma_get_outbuffer() { +// D(std::cout << "Called: " << __func__ << " not mock version" << +// std::endl;); return myDMA.dma_get_outbuffer(); +// } + +extern "C" int dma_copy_to_inbuffer(unsigned int *host_src_address, + int data_length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_copy_to_inbuffer(host_src_address, data_length, offset); +} + +extern "C" int dma_copy_from_outbuffer(unsigned int *host_dst_address, + int data_length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_copy_from_outbuffer(host_dst_address, data_length, offset); +} + +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset) { + myDMA.mlir_dma_copy_to_inbuffer(src.data, src.rank, src.rank, src.offset, + src.sizes, src.strides, offset); + return 0; +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_f32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_i32(&descriptor, offset); +} + +extern "C" int +_mlir_ciface_copy_from_outbuffer_f32(UnrankedMemRefType *M, int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_from_outbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_i32(&descriptor, offset); +} + +template +int mlir_dma_copy_from_outbuffer(const DynamicMemRefType &dst, + int data_length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.mlir_dma_copy_from_outbuffer(dst.data, dst.rank, dst.rank, dst.offset, + dst.sizes, dst.strides, offset); + return 0; +} + +extern "C" int dma_start_send(int length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_start_send(length, offset); +} + +extern "C" int dma_check_send() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return 0; +} + +extern "C" void dma_wait_send() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_wait_send(); +} + +extern "C" int dma_start_recv(int length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_start_recv(length, offset); +} + +extern "C" void dma_wait_recv() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_wait_recv(); +} + 
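+// Illustrative host-side call sequence for this wrapper API. This is only a
+// sketch: the DMA addresses, buffer sizes, and transfer lengths below are
+// placeholders, not values prescribed by the library.
+//
+//   dma_init(/*dma_address=*/0x40400000, /*dma_input_address=*/0x16000000,
+//            /*dma_input_buffer_size=*/100000,
+//            /*dma_output_address=*/0x16400000,
+//            /*dma_output_buffer_size=*/100000);
+//   unsigned int tile[16] = {0};           // payload staged by the caller
+//   dma_copy_to_inbuffer(tile, /*data_length=*/16, /*offset=*/0);
+//   dma_start_send(/*length=*/16, /*offset=*/0);
+//   dma_wait_send();
+//   dma_start_recv(/*length=*/16, /*offset=*/0);
+//   dma_wait_recv();
+//   dma_copy_from_outbuffer(tile, /*data_length=*/16, /*offset=*/0);
+//   dma_free();
+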
+extern "C" int dma_check_recv() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_check_recv(); +} + +extern "C" unsigned int dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_set(dma_virtual_address, offset, value); + return 0; +} + +extern "C" unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_get(dma_virtual_address, offset); +} diff --git a/lib/ExecutionEngine/axi/AxiUtilsMock.cpp b/lib/ExecutionEngine/axi/AxiUtilsMock.cpp new file mode 100644 index 0000000..9f18f8f --- /dev/null +++ b/lib/ExecutionEngine/axi/AxiUtilsMock.cpp @@ -0,0 +1,131 @@ +//===- AxiUtils.cpp - AXI4MLIR implementation ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements wrapper AXI4MLIR library calls. These are the calls +// visible to the MLIR. +// +// This is a mock implementation that only prints to the terminal. +// +//===----------------------------------------------------------------------===// + +#include "soda/ExecutionEngine/axi/AxiUtils.h" + +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +extern "C" void dma_init(unsigned int dma_address, + unsigned int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size) { + std::cout << "Called: " << __func__ << std::endl; + std::cout << "\t" << dma_address << std::endl; + std::cout << "\t" << dma_input_address << std::endl; + std::cout << "\t" << dma_input_buffer_size << std::endl; + std::cout << "\t" << dma_output_address << std::endl; + std::cout << "\t" << dma_output_buffer_size << std::endl; + std::cout << "Called: " << __func__ << std::endl; + return; +} + +extern "C" void dma_free() { std::cout << "Called: " << __func__ << std::endl; } + +extern "C" unsigned int *dma_get_inbuffer() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" unsigned int *dma_get_outbuffer() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int dma_copy_to_inbuffer(unsigned int *host_src_address, + int data_length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int dma_copy_from_outbuffer(unsigned int *host_dst_address, + int data_length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_f32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return 0; +} + +extern "C" int copy_to_inbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType 
descriptor = {rank, ptr}; + return 0; +} + +extern "C" int copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return 0; +} + +extern "C" int copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return 0; +} + +extern "C" int dma_start_send(int length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int dma_check_send() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" void dma_wait_send() { + std::cout << "Called: " << __func__ << std::endl; +} + +extern "C" int dma_start_recv(int length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" void dma_wait_recv() { + std::cout << "Called: " << __func__ << std::endl; +} + +extern "C" int dma_check_recv() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" unsigned int dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} diff --git a/lib/ExecutionEngine/axi/AxiUtilsSysc.cpp b/lib/ExecutionEngine/axi/AxiUtilsSysc.cpp new file mode 100644 index 0000000..23e2a99 --- /dev/null +++ b/lib/ExecutionEngine/axi/AxiUtilsSysc.cpp @@ -0,0 +1,189 @@ +//===- AxiUtils.cpp - AXI4MLIR implementation ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements wrapper AXI4MLIR library calls. These are the calls +// visible to the MLIR. 
+// +//===----------------------------------------------------------------------===// + +#include "soda/ExecutionEngine/axi/AxiUtils.h" + +#include "soda/ExecutionEngine/axi/api_v1.h" + +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +struct dma myDMA; + +extern "C" void dma_init(unsigned int dma_address, + unsigned int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + // std::cout << "\t" << dma_address << std::endl; + // std::cout << "\t" << dma_input_address << std::endl; + // std::cout << "\t" << dma_input_buffer_size << std::endl; + // std::cout << "\t" << dma_output_address << std::endl; + // std::cout << "\t" << dma_output_buffer_size << std::endl; + LOG("Called: " << __func__ << " sysc version"); + LOG("\t" << dma_address); + LOG("\t" << dma_input_address); + LOG("\t" << dma_input_buffer_size); + LOG("\t" << dma_output_address); + LOG("\t" << dma_output_buffer_size); + + myDMA.dma_init(dma_address, dma_input_address, dma_input_buffer_size, + dma_output_address, dma_output_buffer_size); + return; +} + +extern "C" void dma_free() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_free(); +} + +extern "C" unsigned int *dma_get_inbuffer() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_get_inbuffer(); +} + +extern "C" unsigned int *dma_get_outbuffer() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_get_outbuffer(); +} + +extern "C" int dma_copy_to_inbuffer(unsigned int *host_src_address, + int data_length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_copy_to_inbuffer(host_src_address, data_length, offset); +} + +extern "C" int dma_copy_from_outbuffer(unsigned int *host_dst_address, + int data_length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_copy_from_outbuffer(host_dst_address, data_length, offset); +} + +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset) { + myDMA.mlir_dma_copy_to_inbuffer(src.data, src.rank, src.rank, src.offset, + src.sizes, src.strides, offset); + return 0; +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_f32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_i32(&descriptor, offset); +} + +extern "C" int 
+_mlir_ciface_copy_from_outbuffer_f32(UnrankedMemRefType *M, int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_from_outbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_i32(&descriptor, offset); +} + +template +int mlir_dma_copy_from_outbuffer(const DynamicMemRefType &dst, + int data_length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.mlir_dma_copy_from_outbuffer(dst.data, dst.rank, dst.rank, dst.offset, + dst.sizes, dst.strides, offset); + return 0; +} + +extern "C" int dma_start_send(int length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_start_send(length, offset); +} + +extern "C" int dma_check_send() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return 0; +} + +extern "C" void dma_wait_send() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_wait_send(); +} + +extern "C" int dma_start_recv(int length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_start_recv(length, offset); +} + +extern "C" void dma_wait_recv() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_wait_recv(); +} + +extern "C" int dma_check_recv() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_check_recv(); +} + +extern "C" unsigned int dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_set(dma_virtual_address, offset, value); + return 0; +} + +extern "C" unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_get(dma_virtual_address, offset); +} diff --git a/lib/ExecutionEngine/axi/CMakeLists.txt b/lib/ExecutionEngine/axi/CMakeLists.txt new file mode 100644 index 0000000..f3a4adb --- /dev/null +++ b/lib/ExecutionEngine/axi/CMakeLists.txt @@ -0,0 +1,118 @@ +# Exclude these from libMLIR.so because the JIT infrastructure +# is a big dependency which most don't need. 
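+#
+# The shared libraries built below (e.g. mlir_mockaxi_runner_utils and
+# mlir_axi_runner_utils) are meant to be loaded at execution time by an MLIR
+# JIT runner rather than linked into libMLIR.so; an illustrative invocation
+# (paths are placeholders) would be:
+#   mlir-cpu-runner ... -shared-libs=$PREFIX/lib/libmlir_mockaxi_runner_utils.so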
+ +add_subdirectory(api) + +set(LLVM_OPTIONAL_SOURCES + AxiUtils.cpp + AxiUtilsMock.cpp + AxiUtilsSysc.cpp +) + +add_mlir_library(mlir_mockaxi_runner_utils + SHARED + AxiUtilsMock.cpp + + EXCLUDE_FROM_LIBMLIR +) +target_compile_definitions(mlir_mockaxi_runner_utils PRIVATE mlir_mockaxi_runner_utils_EXPORTS) + +add_mlir_library(mlir_axi_runner_utils + SHARED + AxiUtils.cpp + EXCLUDE_FROM_LIBMLIR +) +target_compile_definitions(mlir_axi_runner_utils PRIVATE mlir_axi_runner_utils_EXPORTS) +add_dependencies(mlir_axi_runner_utils + axi_api_v1 +) +target_link_libraries(mlir_axi_runner_utils PUBLIC axi_api_v1) # Needed to call implemented functions + +# Only generate systemc libraries for native builds +if(AXI_CROSSCOMPILING) + message(STATUS "Cross-compiling, SystemC runner libraries are disabed") +else() + message(STATUS "Not Cross-compiling, SystemC runner libraries are enabled") + + if(DEFINED ENV{SYSTEMC_HOME}) + message(STATUS "SYSTEMC_HOME is set: $ENV{SYSTEMC_HOME} -- building sysc runner libs") + + add_mlir_library(mlir_syscaxi_runner_utils + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils PRIVATE mlir_syscaxi_runner_utils_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils + axi_api_v1_sysc + ) + target_link_libraries(mlir_syscaxi_runner_utils PUBLIC axi_api_v1_sysc) # Needed to call implemented functions + + # ---------------------------- + # Matmul accelerator + # Same for accelerator v1 + add_mlir_library(mlir_syscaxi_runner_utils_accv1 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv1 PRIVATE mlir_syscaxi_runner_utils_v1_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv1 + axi_api_v1_sysc_accv1 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv1 PUBLIC axi_api_v1_sysc_accv1) # Needed to call implemented functions + + # Same for accelerator v2 + add_mlir_library(mlir_syscaxi_runner_utils_accv2 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv2 PRIVATE mlir_syscaxi_runner_utils_v2_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv2 + axi_api_v1_sysc_accv2 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv2 PUBLIC axi_api_v1_sysc_accv2) # Needed to call implemented functions + + # Same for accelerator v3 + add_mlir_library(mlir_syscaxi_runner_utils_accv3 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv3 PRIVATE mlir_syscaxi_runner_utils_v3_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv3 + axi_api_v1_sysc_accv3 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv3 PUBLIC axi_api_v1_sysc_accv3) # Needed to call implemented functions + + # Same for accelerator v4 + add_mlir_library(mlir_syscaxi_runner_utils_accv4 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv4 PRIVATE mlir_syscaxi_runner_utils_v4_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv4 + axi_api_v1_sysc_accv4 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv4 PUBLIC axi_api_v1_sysc_accv4) # Needed to call implemented functions + + # ---------------------------- + # Conv accelerator + + # Same for accelerator v1 + add_mlir_library(mlir_syscaxi_runner_utils_conv_accv1 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_conv_accv1 PRIVATE mlir_syscaxi_runner_utils_conv_v1_EXPORTS) + 
add_dependencies(mlir_syscaxi_runner_utils_conv_accv1 + axi_api_v1_sysc_conv_accv1 + ) + target_link_libraries(mlir_syscaxi_runner_utils_conv_accv1 PUBLIC axi_api_v1_sysc_conv_accv1) # Needed to call implemented functions + endif() +endif() + +set(CMAKE_CXX_FLAGS "${tmpcxxflags}") # Revert to normal CXX flags \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/CMakeLists.txt b/lib/ExecutionEngine/axi/api/CMakeLists.txt new file mode 100644 index 0000000..afd4267 --- /dev/null +++ b/lib/ExecutionEngine/axi/api/CMakeLists.txt @@ -0,0 +1,115 @@ +# Exclude these from libMLIR.so because the JIT infrastructure +# is a big dependency which most don't need. + +set(LLVM_OPTIONAL_SOURCES + api_v0.cpp + api_v1.cpp + api_v1_sysc.cpp + api_v2.cpp + api_v2_sysc.cpp +) + +add_mlir_library(axi_api_v0 + SHARED + api_v0.cpp + + EXCLUDE_FROM_LIBMLIR +) + +add_mlir_library(axi_api_v1 + SHARED + api_v1.cpp + + EXCLUDE_FROM_LIBMLIR +) + +set(tmpcxxflags ${CMAKE_CXX_FLAGS}) +string(REPLACE "-Werror=global-constructors" "" FIXED ${CMAKE_CXX_FLAGS}) +string(REPLACE "-Wcast-qual" "-Wno-vla-extension" FIXED ${FIXED}) + +if(AXI_CROSSCOMPILING) + message(STATUS "Cross-compiling, SystemC api libraries are disabed") + string(APPEND FIXED " -mfpu=neon") +else() + message(STATUS "Not Cross-compiling, SystemC api libraries are enabled") + + if(DEFINED ENV{SYSTEMC_HOME}) + message(STATUS "SYSTEMC_HOME is set: $ENV{SYSTEMC_HOME} -- building sysc api libs") + + add_mlir_library(axi_api_v1_sysc + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc PROPERTIES COMPILE_FLAGS "") + target_link_libraries(axi_api_v1_sysc PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # ---------------------------- + # Matmul accelerator + + # Same for accelerator v1 + add_mlir_library(axi_api_v1_sysc_accv1 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv1 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv1 PROPERTIES COMPILE_FLAGS "-DACC_V1") + target_link_libraries(axi_api_v1_sysc_accv1 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # Same for accelerator v2 + add_mlir_library(axi_api_v1_sysc_accv2 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv2 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv2 PROPERTIES COMPILE_FLAGS "-DACC_V2") + target_link_libraries(axi_api_v1_sysc_accv2 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # Same for accelerator v3 + add_mlir_library(axi_api_v1_sysc_accv3 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv3 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv3 PROPERTIES COMPILE_FLAGS "-DACC_V3") + target_link_libraries(axi_api_v1_sysc_accv3 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # Same for accelerator v4 + add_mlir_library(axi_api_v1_sysc_accv4 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv4 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv4 PROPERTIES COMPILE_FLAGS "-DACC_V4") + target_link_libraries(axi_api_v1_sysc_accv4 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # ---------------------------- + # Conv accelerator + + # Same for accelerator v1 + 
add_mlir_library(axi_api_v1_sysc_conv_accv1 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_conv_accv1 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_conv_accv1 PROPERTIES COMPILE_FLAGS "-DCONV_V1") + target_link_libraries(axi_api_v1_sysc_conv_accv1 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + endif() +endif() + +set(CMAKE_CXX_FLAGS "${FIXED}") + +# No additional properties for now +# target_compile_definitions(axi_api_v1 PRIVATE axi_api_EXPORTS) \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v0.cpp b/lib/ExecutionEngine/axi/api/api_v0.cpp new file mode 100644 index 0000000..a07110d --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v0.cpp @@ -0,0 +1,48 @@ +//**********************Deprecated********************** + +#include "mlir/ExecutionEngine/axi/api_v0.h" + +void dma::init(int id) { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + (unsigned long)dma_output_addr); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + (unsigned long)dma_input_addr); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma_collection::dma_init(int dma_count, unsigned int *dma_address, + unsigned int *dma_input_addr, + unsigned int *dma_input_len, + unsigned int *dma_output_addr, + unsigned int *dma_output_len) { + // Open /dev/mem which represents the whole physical memory + int dh = open("/dev/mem", O_RDWR | O_SYNC); + dma_list = new dma[dma_count]; + int id_count = 0; + + for (int i = 0; i < dma_count; i++) { + void *dma_mm = mmap(NULL, 65535, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + dma_address[i]); // Memory map AXI Lite register block + void *dma_in_mm = + mmap(NULL, dma_input_len[i], PROT_READ | PROT_WRITE, MAP_SHARED, dh, + dma_input_addr[i]); // Memory map source address + void *dma_out_mm = + mmap(NULL, dma_output_len[i], PROT_READ, MAP_SHARED, dh, + dma_output_addr[i]); // Memory map destination address + unsigned int *dma_addr = reinterpret_cast(dma_mm); + unsigned int *dma_in = reinterpret_cast(dma_in_mm); + unsigned int *dma_out = reinterpret_cast(dma_out_mm); + + dma_list[i].dma_address = dma_addr; + dma_list[i].dma_input_addr = dma_in; + dma_list[i].dma_output_addr = dma_out; + dma_list[i].dma_input_len = dma_input_len[i]; + dma_list[i].dma_output_len = dma_output_len[i]; + dma_list[i].init(id_count++); + } +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v1.cpp b/lib/ExecutionEngine/axi/api/api_v1.cpp new file mode 100644 index 0000000..5583200 --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v1.cpp @@ -0,0 +1,653 @@ +//===- api_v1.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. 
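+// The API covers DMA setup and teardown (dma_init/dma_free), raw access to
+// the memory-mapped input/output buffers, memcpy-style copies into and out
+// of those buffers, strided memref copies, and the start/wait/check calls
+// that drive the send and receive transfers.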
+// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/axi/api_v1.h" + +#ifdef __arm__ +#include "arm_neon.h" +#endif + +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + void *dma_mm = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_address); // Memory map AXI Lite register block + void *dma_in_mm = + mmap(NULL, _dma_input_buffer_size, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_input_address); // Memory map source address + void *dma_out_mm = + mmap(NULL, _dma_output_buffer_size, PROT_READ, MAP_SHARED, dh, + _dma_output_address); // Memory map destination address + dma_address = reinterpret_cast(dma_mm); + dma_input_address = reinterpret_cast(dma_in_mm); + dma_output_address = reinterpret_cast(dma_out_mm); + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + dma_input_paddress = _dma_input_address; + dma_output_paddress = _dma_output_address; + current_input_offset = 0; + close(dh); + initDMAControls(); // Causes Segfault atm + LOG("DMA Initialised"); +} + +void dma::dma_free() { + munmap(dma_input_address, dma_input_buffer_size); + munmap(dma_output_address, dma_output_buffer_size); + munmap(dma_address, getpagesize()); +} + +// We could reduce to one set of the following calls +//============================================================================== +unsigned int *dma::dma_get_inbuffer() { return dma_input_address; } + +unsigned int *dma::dma_get_outbuffer() { return dma_output_address; } +//============================================================================== +int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, + int offset) { + m_assert("data copy will overflow input buffer", + (unsigned int)(offset + data_length) <= dma_input_buffer_size); + std::memcpy(dma_input_address + offset, src_address, data_length * 4); + current_input_offset += data_length; + return 0; +} + +int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, + int offset) { + m_assert("tries to access data outwith the output buffer", + (unsigned int)(offset + data_length) <= dma_output_buffer_size); + std::memcpy(dst_address, dma_output_address + offset, data_length * 4); + return 0; +} + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *dst_base, const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *srcPtr; + srcPtr = mr_base + mr_offset; + + T *dstPtr; + dstPtr = reinterpret_cast(dst_base) + dst_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. 
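+  // srcStrides follow the memref descriptor, while dstStrides are recomputed
+  // below as row-major suffix products of mr_sizes, so data always lands in
+  // the DMA buffer densely packed regardless of the memref's own layout.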
+ for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + srcStrides[rankp] = mr_strides[rankp]; + + // dstStrides for the array is derived from the input mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the dstStrides are + // 128x16x1 + dstStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + dstStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + +#ifdef __arm__SKIP + // std::cout << "Enter_NEON _ test" << std::endl; + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + + int32x4_t tmp0; + int32x4_t tmp1; + int32x4_t tmp2; + int32x4_t tmp3; + + int64_t sizer_16r = (size % 16); + int64_t sizer_8r = (size % 8); + int64_t sizer_4r = (size % 4); + + if (sizer_16r == 0) { + // std::cout << "Enter_NEON _ 16" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 16) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + tmp2 = vld1q_s32(reinterpret_cast(srcPtr) + j + 8); + tmp3 = vld1q_s32(reinterpret_cast(srcPtr) + j + 12); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + vst1q_s32(reinterpret_cast(dstPtr) + j + 8, tmp2); + vst1q_s32(reinterpret_cast(dstPtr) + j + 12, tmp3); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_8r == 0) { + // std::cout << "Enter_NEON _ 8" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 8) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_4r == 0) { + // std::cout << "Enter_NEON _ 4" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 4) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else { + // std::cout << "Enter_NEON _ 1" << std::endl; + for (int64_t i = 0; i < count; ++i) { + memcpy(dstPtr, srcPtr, size * elemSize); + srcPtr += srcStride; + dstPtr += dstStride; + } + } + return; + } +#else + // create a special case for rank==2 and strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + 
int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < count; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } +#endif + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << dst_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + readIndex += srcStrides[axis]; + writeIndex += 1; // Always increment, it is a flattened dense array + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *src_base, const int src_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
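+  // copy_array_to_memref is the inverse of copy_memref_to_array above: it
+  // scatters the densely packed DMA output buffer back into a (possibly
+  // strided) memref. As before, a zero extent in any dimension means there
+  // is nothing to copy.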
+ for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + +#ifdef __arm__SKIP + // std::cout << "Enter_NEON _ test" << std::endl; + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + + int32x4_t tmp0; + int32x4_t tmp1; + int32x4_t tmp2; + int32x4_t tmp3; + + int64_t sizer_16r = (size % 16); + int64_t sizer_8r = (size % 8); + int64_t sizer_4r = (size % 4); + + if (sizer_16r == 0) { + // std::cout << "Enter_NEON _ 16" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 16) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + tmp2 = vld1q_s32(reinterpret_cast(srcPtr) + j + 8); + tmp3 = vld1q_s32(reinterpret_cast(srcPtr) + j + 12); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + vst1q_s32(reinterpret_cast(dstPtr) + j + 8, tmp2); + vst1q_s32(reinterpret_cast(dstPtr) + j + 12, tmp3); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_8r == 0) { + // std::cout << "Enter_NEON _ 8" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 8) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_4r == 0) { + // std::cout << "Enter_NEON _ 4" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 4) { + // neon vector load and store + tmp0 = 
vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else { + // std::cout << "Enter_NEON _ 1" << std::endl; + for (int64_t i = 0; i < count; ++i) { + memcpy(dstPtr, srcPtr, size * elemSize); + srcPtr += srcStride; + dstPtr += dstStride; + } + } + return; + } +#else + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } +#endif + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << src_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. 
+ indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +int dma::dma_start_send(int length, int offset) { + m_assert("trying to send data outside the input buffer", + (unsigned int)(offset + length) <= dma_input_buffer_size); + dma_set(dma_address, MM2S_START_ADDRESS, dma_input_paddress + (offset * 4)); + msync(dma_address, PAGE_SIZE, MS_SYNC); + dma_set(dma_address, MM2S_LENGTH, length * 4); + LOG("Transfer Started - " << length * 4); + current_input_offset = 0; + return 0; +} + +void dma::dma_wait_send() { + LOG("Data Transfer - Waiting"); + dma_mm2s_sync(); + LOG("Data Transfer - Done"); +} + +int dma::dma_check_send() { + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + bool done = !((!(mm2s_status & 1 << 12)) || (!(mm2s_status & 1 << 1))); + if (done) { + LOG("Data Transfer - Done"); + } else { + LOG("Data Transfer - Not Done"); + } + return done ? 0 : -1; +} + +int dma::dma_start_recv(int length, int offset) { + m_assert("trying receive data outside the output buffer", + (unsigned int)(offset + length) <= dma_output_buffer_size); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress + (offset * 4)); + msync(dma_address, PAGE_SIZE, MS_SYNC); + LOG("Started Receiving " << length * 4); + dma_set(dma_address, S2MM_LENGTH, length * 4); + LOG("Started Receiving " << length * 4); + return 0; +} + +void dma::dma_wait_recv() { + LOG("Data Receive - Waiting"); + LOG("Data Receive - Waiting " << dma_get(dma_address, S2MM_LENGTH)); + dma_s2mm_sync(); + // unsigned int recv_len = dma_get(dma_address,S2MM_LENGTH); + LOG("Data Receive - Done " << dma_get(dma_address, S2MM_LENGTH)); +} + +int dma::dma_check_recv() { + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + bool done = !((!(s2mm_status & 1 << 12)) || (!(s2mm_status & 1 << 1))); + if (done) { + LOG("Data Receive - Done"); + } else { + LOG("Data Receive - Not Done"); + } + return done ? 
0 : -1; +} + +//********************************** Unexposed Functions +//********************************** +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + // dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + // (unsigned long)dma_output_address); // Write destination address + // dma_set(dma_address, MM2S_START_ADDRESS, + // (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + dma_input_paddress); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + *((volatile unsigned int *)(reinterpret_cast(dma_address) + offset)) = + value; + // dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return *((volatile unsigned int *)(reinterpret_cast(dma_address) + + offset)); + // return *((volatile unsigned int*) dma_address[offset >> 2]); + // return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v1_sysc.cpp b/lib/ExecutionEngine/axi/api/api_v1_sysc.cpp new file mode 100644 index 0000000..24faaca --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v1_sysc.cpp @@ -0,0 +1,470 @@ +//===- api_v1.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. +// +//===----------------------------------------------------------------------===// + +#define SYSC +#include "mlir/ExecutionEngine/axi/api_v1.h" + +int sc_main(int argc, char *argv[]) { return 0; } + +// SystemC code does not require all these parameters +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size) { + + sc_report_handler::set_actions("/IEEE_Std_1666/deprecated", SC_DO_NOTHING); + sc_report_handler::set_actions(SC_ID_LOGIC_X_TO_BOOL_, SC_LOG); + sc_report_handler::set_actions(SC_ID_VECTOR_CONTAINS_LOGIC_VALUE_, SC_LOG); + + dma_input_address = + (unsigned int *)malloc(_dma_input_buffer_size * sizeof(int)); + dma_output_address = + (unsigned int *)malloc(_dma_output_buffer_size * sizeof(int)); + + // Initialize with zeros + for (int64_t i = 0; i < _dma_input_buffer_size; i++) { + *(dma_input_address + i) = 0; + } + + for (int64_t i = 0; i < _dma_output_buffer_size; i++) { + *(dma_output_address + i) = 0; + } + + static ACCNAME dut("dut"); + static DMA_DRIVER dm("DMA"); + accelerator_dma_connect(&dut, &dm, _dma_input_buffer_size, + _dma_output_buffer_size); + + dm.DMA_input_buffer = (int *)dma_input_address; + dm.DMA_output_buffer = (int *)dma_output_address; + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + + acc = &dut; + dmad = &dm; + acc->verbose = verbose; + LOG("SystemC dma_init() initializes the DMA"); +} + +void dma::dma_free() { + LOG("SystemC dma_free() deallocates DMA buffers"); + LOG("++++++++++++++++++++++++++++++++++++++++"); + LOG("SystemC simulated cycles: " << sc_time_stamp()); + LOG("DMA Send count: " << dma_send_count); + LOG("DMA Send length: " << dma_send_length); + LOG("DMA Recv count: " << dma_recv_count); + LOG("DMA Recv length: " << dma_recv_length); + LOG("++++++++++++++++++++++++++++++++++++++++"); + acc->print_profile(); + + free(dma_input_address); + free(dma_output_address); +} + +unsigned int *dma::dma_get_inbuffer() { return dma_input_address; } + +unsigned int *dma::dma_get_outbuffer() { return dma_output_address; } + +int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, + int offset) { + LOG("SystemC dma_copy_to_inbuffer()"); + m_assert("data copy will overflow input buffer", + (unsigned int)(offset + data_length) <= dma_input_buffer_size); + memcpy((dma_get_inbuffer() + offset), src_address, data_length * 4); + return 0; +} + +int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, + int offset) { + LOG("SystemC dma_copy_from_outbuffer()"); + m_assert("tries to access data out with the output buffer", + (unsigned int)(offset + data_length) <= dma_output_buffer_size); + memcpy(dst_address, (dma_get_outbuffer() + offset), data_length * 4); + return 0; +} + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *dst_base, const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
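+  // This helper mirrors copy_memref_to_array in api_v1.cpp; the SystemC build
+  // only differs in how dma_init obtains the buffers (malloc here vs. mmap of
+  // /dev/mem on hardware). As there, a zero extent in any dimension means
+  // there is nothing to copy.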
+ for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *srcPtr; + srcPtr = mr_base + mr_offset; + + T *dstPtr; + dstPtr = reinterpret_cast(dst_base) + dst_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + srcStrides[rankp] = mr_strides[rankp]; + + // dstStrides for the array is derived from the input mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the dstStrides are + // 128x16x1 + dstStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + dstStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < count; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + readIndex += srcStrides[axis]; + writeIndex += 1; // Always increment, it is a flattened dense array + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. 
+ indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *src_base, const int src_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a 
result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +int dma::dma_start_send(int length, int offset) { + LOG("SystemC dma_start_send()"); + dmad->input_len = length; + dmad->input_offset = offset; + dmad->send = true; + PFUNC(dma_send_length += length); + PFUNC(dma_send_count++); + return 0; +} + +void dma::dma_wait_send() { + LOG("SystemC dma_wait_send() starts simulation"); + sc_start(); +} + +int dma::dma_check_send() { + LOG("SystemC dma_check_send() does nothing"); + return 0; +} + +int dma::dma_start_recv(int length, int offset) { + LOG("SystemC dma_start_recv()"); + dmad->output_len = length; + dmad->output_offset = offset; + dmad->recv = true; + PFUNC(dma_recv_count++); + return 0; +} + +void dma::dma_wait_recv() { + LOG("SystemC dma_wait_recv() starts simulation"); + sc_start(); + PFUNC(dma_recv_length += dmad->output_len); +} + +int dma::dma_check_recv() { + LOG("SystemC dma_check_recv() does nothing"); + return 0; +} + +// We really don't need any of the functions below to be implemented for SystemC +//********************************** Unexposed Functions 
+//********************************** +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + (unsigned long)dma_output_address); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v2.cpp b/lib/ExecutionEngine/axi/api/api_v2.cpp new file mode 100644 index 0000000..38464db --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v2.cpp @@ -0,0 +1,521 @@ +//===- api_v2.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. 
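+//
+// Compared with api_v1, this version carries explicit element sizes (isize /
+// osize) and exposes the DMA buffers as char *, so buffer sizes and transfer
+// lengths are expressed in elements of arbitrary width rather than fixed
+// 4-byte words.
+//
+// Illustrative init only; the addresses and sizes below are placeholders:
+//
+//   dma d;
+//   d.dma_init(0x40400000, 0x16000000, 65536, sizeof(int),
+//              0x16400000, 65536, sizeof(int));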
+// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/axi/api_v2.h" + +#ifdef __arm__ +#include "arm_neon.h" +#endif + +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, unsigned int _isize, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size, unsigned int _osize) { + + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + dma_input_paddress = _dma_input_address; + dma_output_paddress = _dma_output_address; + isize = _isize; + osize = _osize; + + unsigned int in_size_bytes = dma_input_buffer_size * isize; + unsigned int out_size_bytes = dma_output_buffer_size * osize; + int dh = open("/dev/mem", O_RDWR | O_SYNC); + void *dma_mm = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_address); // Memory map AXI Lite register block + void *dma_in_mm = + mmap(NULL, in_size_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_input_address); // Memory map source address + void *dma_out_mm = + mmap(NULL, out_size_bytes, PROT_READ, MAP_SHARED, dh, + _dma_output_address); // Memory map destination address + + dma_address = reinterpret_cast(dma_mm); + dma_input_address = reinterpret_cast(dma_in_mm); + dma_output_address = reinterpret_cast(dma_out_mm); + + close(dh); + initDMAControls(); // Causes Segfault atm + LOG("DMA Initialised"); +} + +void dma::dma_free() { + unsigned int in_size_bytes = dma_input_buffer_size * isize; + unsigned int out_size_bytes = dma_output_buffer_size * osize; + munmap(dma_input_address, in_size_bytes); + munmap(dma_output_address, out_size_bytes); + munmap(dma_address, getpagesize()); +} + +// We could reduce to one set of the following calls +//============================================================================== + +char *dma::dma_get_inbuffer() { return dma_input_address; } + +char *dma::dma_get_outbuffer() { return dma_output_address; } +//============================================================================== + +// Removing these functions for now +// int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, +// int offset) { +// m_assert("data copy will overflow input buffer", +// (unsigned int)(offset + data_length) <= dma_input_buffer_size); +// std::memcpy(dma_input_address + offset, src_address, data_length * 4); +// current_input_offset += data_length; +// return 0; +// } + +// int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, +// int offset) { +// m_assert("tries to access data outwith the output buffer", +// (unsigned int)(offset + data_length) <= dma_output_buffer_size); +// std::memcpy(dst_address, dma_output_address + offset, data_length * 4); +// return 0; +// } + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *dst_base, + const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
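+  // In the v2 API the DMA buffers are byte-addressed (char *); dst_offset is
+  // applied after the reinterpret_cast below, i.e. it is counted in elements
+  // of T. A zero extent in any dimension means there is nothing to copy.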
+ for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *srcPtr; + srcPtr = mr_base + mr_offset; + + T *dstPtr; + dstPtr = reinterpret_cast(dst_base) + dst_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + srcStrides[rankp] = mr_strides[rankp]; + + // dstStrides for the array is derived from the input mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the dstStrides are + // 128x16x1 + dstStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + dstStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < count; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << dst_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. 
+ // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + readIndex += srcStrides[axis]; + writeIndex += 1; // Always increment, it is a flattened dense array + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *src_base, + const int src_offset, int elebytes) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. 
+ for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << src_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. 
Then continue with the axis one outer. + indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +// DMA Functions +// Updated for char +int dma::dma_start_send(unsigned int length, unsigned int offset) { + m_assert("trying to send data outside the input buffer", + (offset + length) <= dma_input_buffer_size); + unsigned int new_length = length * isize; + unsigned int new_offset = offset * isize; + dma_set(dma_address, MM2S_START_ADDRESS, dma_input_paddress + new_offset); + msync(dma_address, PAGE_SIZE, MS_SYNC); + dma_set(dma_address, MM2S_LENGTH, new_length); + LOG("Transfer Started - " << new_length << " bytes"); + return 0; +} + +void dma::dma_wait_send() { + LOG("Data Transfer - Waiting"); + dma_mm2s_sync(); + LOG("Data Transfer - Done"); +} + +int dma::dma_check_send() { + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + bool done = !((!(mm2s_status & 1 << 12)) || (!(mm2s_status & 1 << 1))); + if (done) { + LOG("Data Transfer - Done"); + } else { + LOG("Data Transfer - Not Done"); + } + return done ? 0 : -1; +} + +// Updated for char +int dma::dma_start_recv(unsigned int length, unsigned int offset) { + m_assert("trying receive data outside the output buffer", + (offset + length) <= dma_output_buffer_size); + unsigned int new_length = length * osize; + unsigned int new_offset = offset * osize; + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress + new_offset); + msync(dma_address, PAGE_SIZE, MS_SYNC); + dma_set(dma_address, S2MM_LENGTH, new_length); + LOG("Started Receiving " << new_length << " bytes"); + return 0; +} + +void dma::dma_wait_recv() { + LOG("Data Receive - Waiting"); + dma_s2mm_sync(); + LOG("Data Received - " << dma_get(dma_address, S2MM_LENGTH) << " bytes"); +} + +int dma::dma_check_recv() { + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + bool done = !((!(s2mm_status & 1 << 12)) || (!(s2mm_status & 1 << 1))); + if (done) { + LOG("Data Receive - Done"); + } else { + LOG("Data Receive - Not Done"); + } + return done ? 
0 : -1; +} + +//********************************** Unexposed Functions +//********************************** + +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + // dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + // (unsigned long)dma_output_address); // Write destination address + // dma_set(dma_address, MM2S_START_ADDRESS, + // (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + dma_input_paddress); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + *((volatile unsigned int *)(reinterpret_cast(dma_address) + offset)) = + value; + // dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return *((volatile unsigned int *)(reinterpret_cast(dma_address) + + offset)); + // return *((volatile unsigned int*) dma_address[offset >> 2]); + // return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v2_sysc.cpp b/lib/ExecutionEngine/axi/api/api_v2_sysc.cpp new file mode 100644 index 0000000..0a0b658 --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v2_sysc.cpp @@ -0,0 +1,472 @@ +//===- api_v2.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. +// +//===----------------------------------------------------------------------===// + +#define SYSC +#include "mlir/ExecutionEngine/axi/api_v2.h" + +int sc_main(int argc, char *argv[]) { return 0; } + +// SystemC code does not require all these parameters +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, unsigned int _isize, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size, unsigned int _osize) { + + sc_report_handler::set_actions("/IEEE_Std_1666/deprecated", SC_DO_NOTHING); + sc_report_handler::set_actions(SC_ID_LOGIC_X_TO_BOOL_, SC_LOG); + sc_report_handler::set_actions(SC_ID_VECTOR_CONTAINS_LOGIC_VALUE_, SC_LOG); + + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + dma_input_paddress = 0; + dma_output_paddress = 0; + isize = _isize; + osize = _osize; + unsigned int in_size_bytes = dma_input_buffer_size * isize; + unsigned int out_size_bytes = dma_output_buffer_size * osize; + + dma_input_address = new char[in_size_bytes](); + dma_output_address = new char[out_size_bytes](); + + static ACCNAME dut("dut"); + static DMA_DRIVER dm("DMA"); + accelerator_dma_connect(&dut, &dm, _dma_input_buffer_size, + _dma_output_buffer_size); + + dm.DMA_input_buffer = (int *)dma_input_address; + dm.DMA_output_buffer = (int *)dma_output_address; + dm.isize = isize; + dm.osize = osize; + acc = &dut; + dmad = &dm; + acc->verbose = verbose; + LOG("SystemC dma_init() initializes the DMA"); +} + +void dma::dma_free() { + LOG("SystemC dma_free() deallocates DMA buffers"); + LOG("++++++++++++++++++++++++++++++++++++++++"); + LOG("SystemC simulated cycles: " << sc_time_stamp()); + LOG("DMA Send count: " << dma_send_count); + LOG("DMA Send length: " << dma_send_length); + LOG("DMA Recv count: " << dma_recv_count); + LOG("DMA Recv length: " << dma_recv_length); + LOG("++++++++++++++++++++++++++++++++++++++++"); + acc->print_profile(); + + delete[] dma_input_address; + delete[] dma_output_address; +} + +char *dma::dma_get_inbuffer() { return dma_input_address; } + +char *dma::dma_get_outbuffer() { return dma_output_address; } + +// Removing these functions for now +// int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, +// int offset) { +// LOG("SystemC dma_copy_to_inbuffer()"); +// m_assert("data copy will overflow input buffer", +// (unsigned int)(offset + data_length) <= dma_input_buffer_size); +// memcpy((dma_get_inbuffer() + offset), src_address, data_length * 4); +// return 0; +// } + +// int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, +// int offset) { +// LOG("SystemC dma_copy_from_outbuffer()"); +// m_assert("tries to access data out with the output buffer", +// (unsigned int)(offset + data_length) <= dma_output_buffer_size); +// memcpy(dst_address, (dma_get_outbuffer() + offset), data_length * 4); +// return 0; +// } + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *dst_base, + const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
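+  // Illustrative note on what follows: this helper linearizes a possibly
+  // strided (non-contiguous) memref into the flat, dense DMA buffer in
+  // row-major order. For example, a 2x3 subview with mr_strides = [8, 1] and
+  // mr_offset = 4 reads the elements at mr_base + 4 + {0, 1, 2, 8, 9, 10} and
+  // writes them to consecutive element slots dst_offset + {0, 1, 2, 3, 4, 5}
+  // of the destination buffer. The first loop below simply returns early when
+  // any dimension is empty.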
+  for (int rankp = 0; rankp < rank; ++rankp)
+    if (mr_sizes[rankp] == 0)
+      return;
+
+  T *srcPtr;
+  srcPtr = mr_base + mr_offset;
+
+  T *dstPtr;
+  dstPtr = reinterpret_cast<T *>(dst_base) + dst_offset;
+
+  if (rank == 0) {
+    // memcpy(dstPtr, srcPtr, elemSize); // broken
+    *dstPtr = *srcPtr; // opt 1
+    // *dstPtr = mr_base[mr_offset]; // opt 2
+    // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3
+    return;
+  }
+
+  int64_t *indices = static_cast<int64_t *>(alloca(sizeof(int64_t) * rank));
+  int64_t *srcStrides = static_cast<int64_t *>(alloca(sizeof(int64_t) * rank));
+  int64_t *dstStrides = static_cast<int64_t *>(alloca(sizeof(int64_t) * rank));
+
+  // Initialize index and scale strides.
+  for (int rankp = 0; rankp < rank; ++rankp) {
+    indices[rankp] = 0;
+    srcStrides[rankp] = mr_strides[rankp];
+
+    // dstStrides for the array are derived from the input mr_sizes:
+    // if the rank is 3 and the mr_sizes are 4x8x16, the dstStrides are
+    // 128x16x1.
+    dstStrides[rankp] = 1;
+    for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) {
+      dstStrides[rankp] *= mr_sizes[rankp2];
+    }
+  }
+
+  // DEBUG:
+  // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl;
+  // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << std::endl;
+  // std::cout << "INFO copy_memref_to_array: sizes: ";
+  // for (int rankp = 0; rankp < rank; ++rankp) {
+  //   std::cout << mr_sizes[rankp] << " ";
+  // }
+  // std::cout << std::endl;
+  // std::cout << "INFO copy_memref_to_array: strides: ";
+  // for (int rankp = 0; rankp < rank; ++rankp) {
+  //   std::cout << mr_strides[rankp] << " ";
+  // }
+  // std::cout << std::endl;
+
+  // Special case for rank==2 and strides[rank-1]==1: copy row by row using
+  // memcpy.
+  if (rank == 2 && mr_strides[rank - 1] == 1) {
+    int64_t size = mr_sizes[rank - 1];        // number of elements per row
+    int64_t count = mr_sizes[rank - 2];       // number of rows
+    int64_t srcStride = mr_strides[rank - 2]; // stride between rows
+    int64_t dstStride = dstStrides[rank - 2]; // stride between rows
+    const int64_t elemSize = sizeof(T);
+    for (int64_t i = 0; i < count; ++i) {
+      // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " "
+      //           << srcPtr << " " << size * elemSize << std::endl;
+      memcpy(dstPtr, srcPtr, size * elemSize);
+      srcPtr += srcStride;
+      dstPtr += dstStride;
+    }
+    return;
+  }
+
+  int64_t volatile readIndex = 0;
+  int64_t volatile writeIndex = 0;
+  for (;;) {
+    // TODO: Try option 1 again
+    // NOTE: broken memcpy could have been a result of implicit casting
+    // due to type mismatch
+
+    // Copy over the element, byte by byte.
+    // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken
+    *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1
+    // *(dstPtr + writeIndex) = mr_base[mr_offset + readIndex]; // opt 2
+    // dst_base[dst_offset + writeIndex] = mr_base[mr_offset + readIndex]; // opt 3
+
+    // Advance index and read position.
+    for (int64_t axis = rank - 1; axis >= 0; --axis) {
+      // Advance at current axis.
+      auto newIndex = ++indices[axis];
+      readIndex += srcStrides[axis];
+      writeIndex += 1; // Always increment, it is a flattened dense array
+      // If this is a valid index, we have our next index, so continue copying.
+      if (mr_sizes[axis] != newIndex)
+        break;
+      // We reached the end of this axis. If this is axis 0, we are done.
+      if (axis == 0)
+        return;
+      // Else, reset to 0 and undo the advancement of the linear index that
+      // this axis had. Then continue with the axis one outer.
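+      // (Concrete trace, assuming mr_sizes = [4, 8]: after the 8th element of
+      // a row, indices[1] wraps to 0, readIndex is rewound by
+      // 8 * srcStrides[1], the extra writeIndex bump is undone, and the carry
+      // falls through to axis 0, which then advances readIndex by
+      // srcStrides[0].)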
+ indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *src_base, + const int src_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of 
implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +int dma::dma_start_send(int length, int offset) { + m_assert("trying to send data outside the input buffer", + (offset + length) <= dma_input_buffer_size); + LOG("SystemC dma_start_send()"); + dmad->input_len = length; + dmad->input_offset = offset; + dmad->send = true; + PFUNC(dma_send_length += length); + PFUNC(dma_send_count++); + return 0; +} + +void dma::dma_wait_send() { + LOG("SystemC dma_wait_send() starts simulation"); + sc_start(); +} + +int dma::dma_check_send() { + LOG("SystemC dma_check_send() does nothing"); + return 0; +} + +int dma::dma_start_recv(int length, int offset) { + m_assert("trying receive data outside the output buffer", + (offset + length) <= dma_output_buffer_size); + LOG("SystemC dma_start_recv()"); + dmad->output_len = length; + dmad->output_offset = offset; + dmad->recv = true; + PFUNC(dma_recv_count++); + return 0; +} + +void dma::dma_wait_recv() { + LOG("SystemC dma_wait_recv() starts simulation"); + sc_start(); + PFUNC(dma_recv_length += dmad->output_len); +} + +int dma::dma_check_recv() { + 
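+  // In this SystemC backend the check calls are effectively no-ops:
+  // dma_wait_send() and dma_wait_recv() drive the simulation forward with
+  // sc_start(), so a transfer is already complete by the time its status is
+  // polled.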
LOG("SystemC dma_check_recv() does nothing"); + return 0; +} + +// We really don't need any of the functions below to be implemented for SystemC +//********************************** Unexposed Functions +//********************************** +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + (unsigned long)dma_output_address); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc b/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc index bdf19ce..a398242 100644 --- a/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc +++ b/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc @@ -337,7 +337,8 @@ class InsExtSequence { lhs = rhs; } - template void setMemberOnce(T *&lhs, T *rhs) { + template + void setMemberOnce(T *&lhs, T *rhs) { assert(lhs == nullptr); lhs = rhs; } @@ -608,7 +609,8 @@ static Instruction *duplicateGEPWithRankedArray(Instruction *I, IdxList.push_back(Addr); GetElementPtrInst *NewGEP = GetElementPtrInst::CreateInBounds( - RankedArrayPtr->getType()->getScalarType()->getPointerElementType(), RankedArrayPtr, IdxList, "gep" + Twine(NumNewGEP++), GEP->getNextNode()); + RankedArrayPtr->getType()->getScalarType()->getPointerElementType(), + RankedArrayPtr, IdxList, "gep" + Twine(NumNewGEP++), 
GEP->getNextNode()); return NewGEP; } @@ -680,7 +682,7 @@ static SmallVector TopologicalSort(ArrayRef funcs) { graph[F] = {}; for (Function *F : funcs) - for (BasicBlock &BB : F->getBasicBlockList()) + for (BasicBlock &BB : *F) for (Instruction &I : BB) if (isa(I) && Avail.count(cast(I).getCalledFunction())) @@ -1030,8 +1032,9 @@ static void convertMemRefToArray(Module &M, bool ranked = false) { indices.push_back(ConstantInt::get(indices.front()->getType(), 0)); std::reverse(indices.begin(), indices.end()); - NewGEP = GetElementPtrInst::CreateInBounds(ptr->getType()->getScalarType()->getPointerElementType(), ptr, indices, Twine(""), - I->getNextNode()); + NewGEP = GetElementPtrInst::CreateInBounds( + ptr->getType()->getScalarType()->getPointerElementType(), ptr, + indices, Twine(""), I->getNextNode()); LLVM_DEBUG({ dbgs() << "Newly generated GEP: "; NewGEP->dump(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index affa4d2..f9ac297 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,6 +18,7 @@ set(SODA_TEST_DEPENDS AllocaNamer XMLWriter VhlsLLVMRewriter + mlir_mockaxi_runner_utils ) if(MLIR_ENABLE_BINDINGS_PYTHON) diff --git a/test/Conversion/AccelToRunner/accel-to-runtime.mlir b/test/Conversion/AccelToRunner/accel-to-runtime.mlir new file mode 100644 index 0000000..5cfd0ba --- /dev/null +++ b/test/Conversion/AccelToRunner/accel-to-runtime.mlir @@ -0,0 +1,107 @@ +// RUN: soda-opt %s --test-accel-to-axi4mlir | FileCheck %s + + +// CHECK: func.func private @dma_init +// CHECK-NOT: func.func private @dma_init + +// CHECK: func.func private @dma_free +// CHECK-NOT: func.func private @dma_free + +// CHECK-LABEL: test_init_dma +// CHECK: call @dma_init(%arg0 +// CHECK: call @dma_free +func.func @test_init_dma( + %dmaAddress : i32, + %dmaInputAddress : i32, + %dmaInputBufferSize : i32, + %dmaOutputAddress : i32, + %dmaOutputBufferSize : i32) { + accel.init_dma %dmaAddress, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + return +} + +// CHECK-LABEL: test_init_dma2 +// CHECK: call @dma_init(%arg0 +// CHECK-NEXT: call @dma_init(%arg1 +// CHECK-NEXT: call @dma_init(%arg2 +// CHECK: call @dma_free +func.func @test_init_dma2( + %dmaAddress : i32, + %dmaAddress1 : i32, + %dmaAddress2 : i32, + %dmaInputAddress : i32, + %dmaInputBufferSize : i32, + %dmaOutputAddress : i32, + %dmaOutputBufferSize : i32) { + accel.init_dma %dmaAddress, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + accel.init_dma %dmaAddress1, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + accel.init_dma %dmaAddress2, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + return +} + +// CHECK-LABEL: test_send +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %[[C0]]) : (memref<*xi32>, i32) -> i32 +// CHECk: call @dma_start_send +// CHECK: call @dma_wait_send +func.func @test_send(%A: memref<60x80xi32>) -> i32 { + %offset = accel.send %A : ( memref<60x80xi32> ) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_send_with_offset +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +// CHECK: return %c4800 +func.func @test_send_with_offset(%A: memref<60x80xi32>, %offset0: i32) -> i32 { + %offset = 
accel.send %A, %offset0 : (memref<60x80xi32> , i32) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_send_with_subview +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +// CHECK: return %c512 +#map = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)> +func.func @test_send_with_subview(%input: memref<4x1024xi32>) -> i32 { + %cst_2 = arith.constant 2 : index + %0 = memref.subview %input[%cst_2, 256] [2, 256] [1, 1] : memref<4x1024xi32> to memref<2x256xi32, #map> + %offset = accel.send %0 : ( memref<2x256xi32, #map> ) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_sendLiteral +// CHECK: %[[INPUT:.*]]: i32) +// CHECK: %[[TMP:.*]] = memref.alloc() : memref +// CHECK: memref.store %[[INPUT]], %[[TMP]][] : memref +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +// CHECK: memref.dealloc %[[TMP]] : memref +// CHECK: return %c1 +func.func @test_sendLiteral(%input: i32) -> i32 { + %offset = accel.sendLiteral %input : ( i32 ) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_recv_with_offset +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECk: call @dma_start_recv +// CHECK: call @dma_wait_recv +// CHECK: call @copy_from_outbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +func.func @test_recv_with_offset(%A: memref<60x80xi32>, %offset0: i32) -> i32 { + %offset = accel.recv %A, %offset0 : (memref<60x80xi32> , i32) -> i32 + return %offset : i32 +} diff --git a/test/Dialect/Accel/accel-dialect.mlir b/test/Dialect/Accel/accel-dialect.mlir new file mode 100644 index 0000000..464c941 --- /dev/null +++ b/test/Dialect/Accel/accel-dialect.mlir @@ -0,0 +1,33 @@ +// RUN: soda-opt %s | FileCheck %s + +// CHECK-LABEL: test_init_dma +func.func @test_init_dma( + %dmaAddress : i32, + %dmaInputAddress : i32, + %dmaInputBufferSize : i32, + %dmaOutputAddress : i32, + %dmaOutputBufferSize : i32) { + accel.init_dma %dmaAddress, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + func.return +} + +// CHECK-LABEL: test_send +func.func @test_send(%A: memref<60x80xf32>) -> i32 { + %offset = accel.send %A : (memref<60x80xf32>) -> i32 + func.return %offset : i32 +} + +// CHECK-LABEL: test_send_with_offset +func.func @test_send_with_offset(%A: memref<60x80xf32>, %offset0: i32) -> i32 { + %offset = accel.send %A, %offset0 : (memref<60x80xf32> , i32) -> i32 + func.return %offset : i32 +} + +// CHECK-LABEL: test_recv_with_offset +func.func @test_recv_with_offset(%A: memref<60x80xf32>, %offset0: i32) -> i32 { + %offset = accel.recv %A, %offset0 : (memref<60x80xf32> , i32) -> i32 + func.return %offset : i32 +} diff --git a/test/Dialect/Affine/fusion.mlir b/test/Dialect/Affine/fusion.mlir index bc46e00..4809730 100644 --- a/test/Dialect/Affine/fusion.mlir +++ b/test/Dialect/Affine/fusion.mlir @@ -1,4 +1,4 @@ - // RUN: soda-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s + // RUN: soda-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal}))' -split-input-file | FileCheck %s func.func @fusion_with_arith(%arg0: memref<4x4xf32>, %arg1: memref<4x4xf32>, %arg2: memref<4x4xf32>) -> memref<4x4xf32> { %0 = memref.alloc() : memref<4x4xf32> diff --git a/test/Dialect/Linalg/tile.mlir b/test/Dialect/Linalg/tile.mlir index bb39fd1..900aba0 100644 --- 
a/test/Dialect/Linalg/tile.mlir +++ b/test/Dialect/Linalg/tile.mlir @@ -1,4 +1,5 @@ // RUN: soda-opt %s -soda-linalg-tile="tile-sizes=2,4,8 anchor-op=linalg.matmul" -cse| FileCheck %s --check-prefix=TILE +// RUN: soda-opt %s -soda-linalg-tile="tile-sizes=2,3 anchor-op=linalg.conv_2d" -cse| FileCheck %s --check-prefix=TILE_CONV // transform.sequence failures(propagate) { @@ -39,4 +40,14 @@ func.func @linalg_generic(%in0t: tensor<4x4xf32>, %out0t: tensor<4xf32>) { } // TILE-LABEL: func.func @linalg_generic -// TILE-NOT: scf.for \ No newline at end of file +// TILE-NOT: scf.for + +func.func @conv(%arg0 : memref, %arg1 : memref, %arg2 : memref) { + linalg.conv_2d ins(%arg0, %arg1 : memref, memref) outs(%arg2 : memref) + return +} + +// TILE_CONV: func @conv +// TILE_CONV: scf.for %{{.*}} = %{{.*}} to %{{.*}} step +// TILE_CONV: scf.for %{{.*}} = %{{.*}} to %{{.*}} step +// TILE_CONV: linalg.conv_2d \ No newline at end of file diff --git a/test/Dialect/Transform/transform-on-linalg.mlir b/test/Dialect/Transform/transform-on-linalg.mlir index 1810d3d..eee102f 100644 --- a/test/Dialect/Transform/transform-on-linalg.mlir +++ b/test/Dialect/Transform/transform-on-linalg.mlir @@ -7,7 +7,7 @@ transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 - %1, %loops:3 = transform.structured.tile %0 [4, 4, 4] + %1, %loops:3 = transform.structured.tile %0 [4, 4, 4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) } // CHECK-TILE-LABEL: func @tile_linalg_matmul_on_tensors( @@ -55,4 +55,4 @@ func.func @tile_linalg_matmul_on_memrefs( // CHECK-TILE: return %[[TC]] : memref<128x128xf32> return %arg2 : memref<128x128xf32> -} \ No newline at end of file +} diff --git a/test/axi4mlir-runner/run-axi-v1-data-copy.mlir b/test/axi4mlir-runner/run-axi-v1-data-copy.mlir new file mode 100644 index 0000000..f8c3d14 --- /dev/null +++ b/test/axi4mlir-runner/run-axi-v1-data-copy.mlir @@ -0,0 +1,149 @@ +// RUN: soda-opt %s \ +// RUN: -convert-linalg-to-loops -convert-scf-to-cf \ +// RUN: --canonicalize --cse \ +// RUN: --convert-memref-to-llvm \ +// RUN: --convert-math-to-llvm --convert-math-to-libm \ +// RUN: -arith-expand \ +// RUN: -memref-expand \ +// RUN: --convert-arith-to-llvm \ +// RUN: --convert-func-to-llvm --reconcile-unrealized-casts | \ +// RUN: mlir-cpu-runner \ +// RUN: -e main -entry-point-result=void \ +// RUN: -shared-libs=%sodashlibdir/libmlir_mockaxi_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// MLIR Runner +func.func private @printMemrefF32(memref<*xf32>) + +// AXI4MLIR func.functions +func.func private @dma_init(index, index, index, index, index) -> () +func.func private @dma_free() -> () + +func.func private @mlir_dma_copy_to_inbuffer(memref<*xf32>, i64, i64) -> (i64) +func.func private @mlir_dma_copy_from_outbuffer(memref<*xf32>, i64, i64) -> (i64) +func.func private @copy_to_inbuffer_f32(memref<*xf32>, i64) -> (i64) +func.func private @copy_from_outbuffer_f32(memref<*xf32>, i64) -> (i64) + +func.func private @dma_start_send(i64, i64) -> (i64) +func.func private @dma_wait_send() -> () + +func.func private @dma_start_recv(i64, i64) -> (i64) +func.func private @dma_wait_recv() -> () + +// Performing these C opertaions +// dma1.dma_init(0,0,1000,0,1000); +// dma1.dma_copy_to_inbuffer(reinterpret_cast(inputs),rows*depth,0); +// dma1.dma_copy_to_inbuffer(reinterpret_cast(weightsT),depth*cols,rows*depth); +// 
dma1.dma_start_send(dma1.current_input_offset,0); +// dma1.dma_start_recv(rows*cols +1 ,0); +// dma1.dma_wait_send(); +// dma1.dma_wait_recv(); +// dma1.dma_copy_from_outbuffer(reinterpret_cast(accelerated_outputs),cols*rows,0); + +func.func @alloc_2d_filled_f32(%s1 : index, %s2 : index, %f : f32) -> memref { + %buf = memref.alloc(%s1, %s2) : memref + linalg.fill ins(%f : f32) outs(%buf : memref) + + return %buf : memref +} + +func.func @alloc_2d_filled_inc_f32(%arg0: index, %arg1: index, %arg2: f32) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 1.000000e+02 : f32 + %0 = memref.alloc(%arg0, %arg1) : memref + linalg.fill ins(%arg2 : f32) outs(%0 : memref) + scf.for %arg3 = %c0 to %arg0 step %c1 { + scf.for %arg4 = %c0 to %arg1 step %c1 { + %1 = arith.index_cast %arg3 : index to i32 + %2 = arith.index_cast %arg4 : index to i32 + %3 = arith.sitofp %1 : i32 to f32 + %4 = arith.sitofp %2 : i32 to f32 + %5 = arith.mulf %3, %cst : f32 + %6 = arith.addf %4, %5 : f32 + memref.store %6, %0[%arg3, %arg4] : memref + } + } + return %0 : memref +} + +func.func @main() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + + // Prepare tile sizes + %ts_a1 = arith.constant 4 : i64 + %ts_a2 = arith.constant 4 : i64 + %ts_o1 = arith.constant 4 : i64 + %ts_o2 = arith.constant 4 : i64 + + + %c1_0 = arith.constant 1 : i64 + %cst_1 = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + + + %A = call @alloc_2d_filled_inc_f32(%c4, %c4, %cst_1) : (index, index, f32) -> (memref) + %B = call @alloc_2d_filled_f32(%c4, %c4, %cst_1) : (index, index, f32) -> (memref) + %C = call @alloc_2d_filled_f32(%c4, %c4, %cst_0) : (index, index, f32) -> (memref) + + %A_typed = memref.cast %A: memref to memref<4x4xf32> + %B_typed = memref.cast %B: memref to memref<4x4xf32> + %C_typed = memref.cast %C: memref to memref<4x4xf32> + + %in1 = memref.cast %A_typed: memref<4x4xf32> to memref<*xf32> + %in2 = memref.cast %B_typed: memref<4x4xf32> to memref<*xf32> + %out1 = memref.cast %C_typed: memref<4x4xf32> to memref<*xf32> + + + call @printMemrefF32(%in1) : (memref<*xf32>) -> () + call @printMemrefF32(%in2) : (memref<*xf32>) -> () + + // Initializes the DMA + call @dma_init(%c0, %c0, %c1000, %c0, %c1000) : (index,index,index,index,index ) -> () + + // Sizes of in and out buffers + %in1_lenght = arith.muli %ts_a1, %ts_a2 : i64 + %in2_lenght = arith.muli %ts_a1, %ts_a2 : i64 + %total_input_lenght = arith.addi %in1_lenght, %in2_lenght : i64 + %out_lenght = arith.muli %ts_o1, %ts_o2 : i64 + + %in1_offset = arith.constant 0 : i64 // offset on the input buffer + %in2_offset = arith.muli %c1_0, %in1_lenght : i64 // offset on the input buffer + %out_offset = arith.constant 0 : i64 // offset on the output buffer + + // Copy data to be transfered and set the transfer size + call @copy_to_inbuffer_f32(%in1, %in1_offset) : (memref<*xf32>, i64) -> (i64) + call @copy_to_inbuffer_f32(%in2, %in2_offset) : (memref<*xf32>, i64) -> (i64) + call @dma_start_send (%total_input_lenght, %in1_offset) : (i64, i64) -> (i64) + call @dma_start_recv (%out_lenght, %out_offset) : (i64, i64) -> (i64) + + // Wait for operations to complete + call @dma_wait_send () : () -> () + call @dma_wait_recv () : () -> () + + + // Copy C tile from DMA output buffer + call 
@copy_from_outbuffer_f32 (%out1, %in2_offset) : (memref<*xf32>, i64) -> (i64) + + // Cleanup + call @dma_free() : () -> () + + // Print output + call @printMemrefF32(%out1) : (memref<*xf32>) -> () + return +} + +//CHECK: dma_init +//CHECK: dma_start_send +//CHECK: dma_start_recv +//CHECK: dma_wait_send +//CHECK: dma_wait_recv +//CHECK: dma_free \ No newline at end of file diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 3c756b2..3ea3f33 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -32,6 +32,7 @@ config.substitutions.append(('%PATH%', config.environment['PATH'])) config.substitutions.append(('%sodashlibdir', config.soda_lib_root)) config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) +config.substitutions.append(('%mlir_lib_dir', config.mlir_lib_root)) llvm_config.with_system_environment( ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) @@ -59,10 +60,15 @@ 'soda-translate', 'soda-capi-test', 'mlir-runner', + 'mlir-cpu-runner', 'opt', ToolSubst('%PYTHON', config.python_executable, unresolved='ignore'), ] +print("==========") +print(config.llvm_tools_dir) +print("==========") + llvm_config.add_tool_substitutions(tools, tool_dirs) llvm_config.with_environment('PYTHONPATH', [ diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index e8b937a..e7253da 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -33,6 +33,7 @@ config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' config.host_arch = "@HOST_ARCH@" config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.mlir_obj_root = "@MLIR_BINARY_DIR@" +config.mlir_lib_root = "@LLVM_LIBRARY_DIR@" config.mlir_tools_dir = "@MLIR_TOOLS_DIR@" config.soda_src_root = "@CMAKE_SOURCE_DIR@" config.soda_obj_root = "@CMAKE_BINARY_DIR@" diff --git a/test/llvm_plugin/vhls-rewriter.ll b/test/llvm_plugin/vhls-rewriter.ll index 72771fa..c25b520 100644 --- a/test/llvm_plugin/vhls-rewriter.ll +++ b/test/llvm_plugin/vhls-rewriter.ll @@ -2,7 +2,7 @@ ; RUN: -xlntbgen -xlntbdummynames="gemm.dummy.c" -xlntbtclnames="gemm.run.tcl" \ ; RUN: -xlnllvm="test.ll" -xlnpath=test_path \ ; RUN: -clock-period-ns=10 -target=test_board \ -; RUN: -S -enable-new-pm=0 < %s 2>&1 | FileCheck %s +; RUN: -S -enable-new-pm=0 -opaque-pointers=0 < %s 2>&1 | FileCheck %s ; RUN: FileCheck %s -input-file=gemm.run.tcl --check-prefixes=CHECK_TCL ; RUN: FileCheck %s -input-file=gemm.dummy.c --check-prefixes=CHECK_TB diff --git a/test/soda-opt/soda-opt.mlir b/test/soda-opt/soda-opt.mlir index a6dc1e5..2ed377c 100644 --- a/test/soda-opt/soda-opt.mlir +++ b/test/soda-opt/soda-opt.mlir @@ -1,6 +1,7 @@ // RUN: soda-opt --show-dialects | FileCheck %s // RUN: soda-opt --h | FileCheck %s -check-prefix=CHECKHELP // +// CHECK: accel // CHECK: affine // CHECK: arith // CHECK: builtin diff --git a/tools/soda-opt/CMakeLists.txt b/tools/soda-opt/CMakeLists.txt index 9aab828..d4d7212 100644 --- a/tools/soda-opt/CMakeLists.txt +++ b/tools/soda-opt/CMakeLists.txt @@ -72,6 +72,7 @@ set(LIBS # Conversion SODA SODAKERNELToSODA SODAFuncToLLVM + # SODALinalgToAccel # SODA SODALinalgTransforms @@ -85,6 +86,10 @@ set(LIBS # SNN MLIRSNNOps MLIRSNNTransforms + + # ACCEL + SODAAccelDialect + SODAAccelToRuntime ) set(SOURCES diff --git a/tools/soda-opt/soda-opt.cpp b/tools/soda-opt/soda-opt.cpp index feb5b1e..ba2b7f2 100644 --- a/tools/soda-opt/soda-opt.cpp +++ b/tools/soda-opt/soda-opt.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/ToolOutputFile.h" #include "soda/Conversion/Passes.h" +#include "soda/Dialect/Accel/IR/Accel.h" #include "soda/Dialect/Linalg/Reports/Passes.h" #include 
"soda/Dialect/Linalg/Transforms/Passes.h" #include "soda/Dialect/SNN/IR/SNN.h" @@ -123,8 +124,8 @@ int main(int argc, char **argv) { //===--------------------------------------------------------------------===// // Dialects - registry.insert(); - registry.insert(); + registry.insert(); // ----- SODA ----- // Misc passes @@ -163,6 +164,7 @@ int main(int argc, char **argv) { mlir::soda::registerOptimizedForVitisHLSPass(); // Conversion passes + mlir::soda::registerConvertAccelToAXI4MLIR(); // ----- SNN ----- mlir::snn::registerSNNPrintPass();