diff --git a/.editorconfig b/.editorconfig index 7242dd283c..71bcacde7d 100644 --- a/.editorconfig +++ b/.editorconfig @@ -20,3 +20,7 @@ indent_size = 4 [*.toml] indent_style = space indent_size = 2 + +[*.md] +indent_style = space +indent_size = 2 diff --git a/.flake/pkgs/ffdb/default.nix b/.flake/pkgs/ffdb/default.nix new file mode 100644 index 0000000000..8e3989372a --- /dev/null +++ b/.flake/pkgs/ffdb/default.nix @@ -0,0 +1,40 @@ +{ lib +, stdenv +, makeWrapper +, gdb +, python3 +, proj +}: + +stdenv.mkDerivation rec { + pname = "ffdb"; + version = "0.1"; + + pythonPath = with python3.pkgs; makePythonPath [ + proj + ]; + + dontBuild = true; + + nativeBuildInputs = [ makeWrapper ]; + + src = ./.; + + installPhase = '' + mkdir -p $out/share/ffdb + cp ffdb.py $out/share/ffdb + makeWrapper ${gdb}/bin/gdb $out/bin/gdb \ + --add-flags "-q -x $out/share/ffdb/ffdb.py" \ + --set NIX_PYTHONPATH ${pythonPath} \ + --prefix PATH : ${lib.makeBinPath [ + python3 + ]} + cp $out/bin/gdb $out/bin/ffdb + ''; + + nativeCheckInputs = [ + gdb + python3 + proj + ]; +} diff --git a/.flake/pkgs/ffdb/ffdb.py b/.flake/pkgs/ffdb/ffdb.py new file mode 100644 index 0000000000..84354ccd82 --- /dev/null +++ b/.flake/pkgs/ffdb/ffdb.py @@ -0,0 +1,7 @@ +from proj.config_file import get_config_root +from pathlib import Path +import gdb + +gdb.execute(f'directory {get_config_root(Path.cwd())}') +gdb.prompt_hook = lambda x: '(ffdb) ' +gdb.execute('set history save on') diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 183028b022..684c63542e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -9,7 +9,3 @@ Linked Issues: Issues closed by this PR: - Closes # - -**Before merging:** - -- [ ] Did you update the [flexflow-third-party](https://github.com/flexflow/flexflow-third-party) repo, if modifying any of the Cmake files, the build configs, or the submodules? diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml deleted file mode 100644 index 672644388c..0000000000 --- a/.github/workflows/clang-format-check.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: clang-format Check -on: [push, pull_request, workflow_dispatch] -jobs: - formatting-check: - name: Formatting Check - runs-on: ubuntu-latest - strategy: - fail-fast: true - matrix: - path: - - check: "lib" - - check: "tests" - - check: "examples" - - check: "bindings" - - check: "bin" - exclude: '\.proto$' - steps: - - uses: actions/checkout@v2 - - name: Run clang-format style check for C/C++/Protobuf programs. - uses: lockshaw/clang-format-action@v4.11.0-flexflow-3 - with: - clang-format-version: "16" - check-path: ${{ matrix.path['check'] }} - exclude-regex: ${{ matrix.path['exclude'] }} diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml new file mode 100644 index 0000000000..88f98b6e9a --- /dev/null +++ b/.github/workflows/clang-format.yml @@ -0,0 +1,13 @@ +name: clang-format +on: [push, pull_request, workflow_dispatch] +jobs: + formatting-check: + name: Formatting Check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Run clang-format style check for C/C++/Protobuf programs. 
+ uses: lockshaw/clang-format-action@v4.11.0-flexflow-3 + with: + clang-format-version: "16" + exclude-regex: '\.proto$' diff --git a/.github/workflows/helpers/build_target.sh b/.github/workflows/helpers/build_target.sh deleted file mode 100755 index cc4e25cc0b..0000000000 --- a/.github/workflows/helpers/build_target.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /usr/bin/env bash - -set -euo pipefail - -DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")" -REPO="$(realpath -- "$DIR/../../../")" - -cd "$REPO/build-ci" -make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) "$@" diff --git a/.github/workflows/helpers/cmake_cuda.sh b/.github/workflows/helpers/cmake_cuda.sh deleted file mode 100755 index f062569efb..0000000000 --- a/.github/workflows/helpers/cmake_cuda.sh +++ /dev/null @@ -1,29 +0,0 @@ -#! /usr/bin/env bash - -set -euo pipefail -set -x - -DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")" -REPO="$(realpath -- "$DIR/../../../")" - -export FF_GPU_BACKEND="cuda" -export FF_CUDA_ARCH=70 - -if [[ -d "$REPO/build-ci" ]]; then - rm -rf "$REPO/build-ci" -fi -mkdir "$REPO/build-ci" -cd "$REPO/build-ci" -#if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then -# export FF_BUILD_ALL_EXAMPLES=ON -# export FF_BUILD_UNIT_TESTS=ON -#fi -IFS=" " read -r -a FLAGS <<< "$CMAKE_FLAGS" -../config/config.linux \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ - -DFF_USE_CODE_COVERAGE=ON \ - "${FLAGS[@]}" - -# vim: set tabstop=2 shiftwidth=2 expandtab: diff --git a/.github/workflows/helpers/free_space_on_runner_gpu.sh b/.github/workflows/helpers/free_space_on_runner_gpu.sh new file mode 100755 index 0000000000..a382ee58f6 --- /dev/null +++ b/.github/workflows/helpers/free_space_on_runner_gpu.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail +set -x + +sudo rm -rf /usr/share/dotnet +sudo rm -rf /usr/local/lib/android +sudo rm -rf /opt/ghc +sudo rm -rf "/usr/local/share/boost" diff --git a/.github/workflows/helpers/gpu_ci_helper.py b/.github/workflows/helpers/gpu_ci_helper.py deleted file mode 100644 index c29994795f..0000000000 --- a/.github/workflows/helpers/gpu_ci_helper.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 - -from github import Github -import os, sys, argparse, time - - -def get_num_workflow_runs(repo, workflow_names, in_progress_only=False): - workflows = [ - w for w in repo.get_workflows() for w_name in workflow_names if w.path == w_name - ] - if len(workflows) != len(workflow_names): - print( - f"Found {len(workflows)} workflows instead of {len(workflow_names)}. Weird." 
- ) - sys.exit(1) - count = 0 - for workflow in workflows: - running_states = ( - ["in_progress"] if in_progress_only else ["queued", "in_progress"] - ) - runs = [ - run for status in running_states for run in workflow.get_runs(status=status) - ] - count += len(runs) - return count - - -if __name__ == "__main__": - - # Check who is running this script (the daemon or a regular gpu-ci runner) - parser = argparse.ArgumentParser() - parser.add_argument("--daemon", action="store_true") - args = parser.parse_args() - - # Log into the GitHub API and get a handle to the repo - git_token = os.getenv("FLEXFLOW_TOKEN") or "" - if len(git_token) < 40: - print("FLEXFLOW_TOKEN not set properly") - sys.exit(1) - git_client = Github(git_token) - if not git_client: - print("Could not get a Git client") - sys.exit(1) - repo = git_client.get_repo("flexflow/FlexFlow") - if not repo: - print("Could not access the FlexFlow repo") - sys.exit(1) - - if args.daemon: - print("Running the daemon...") - # Check if there is any `gpu-ci` workflow in progress or queued - target_workflows = [ - ".github/workflows/gpu-ci.yml", - ".github/workflows/multinode-test.yml", - ] - n = get_num_workflow_runs(repo, target_workflows, in_progress_only=False) - print(f"Detected {n} GPU-related workflow runs in progress or queued") - - instance_id = os.getenv("FLEXFLOW_RUNNER_INSTANCE_ID") or "" - if len(instance_id) != 19: - print("FLEXFLOW_RUNNER_INSTANCE_ID not set properly") - sys.exit(1) - # If there are `gpu-ci` runs in progress or queued, turn on the `flexflow-runner` spot instance, - # if it is not already on. If there are no `gpu-ci` runs in progress or queued, turn off - # the spot instance if it is not already off. - if n > 0: - print("Starting the `flexflow-runner` spot instance (if not already on)...") - os.system( - f"aws ec2 start-instances --region us-east-2 --instance-ids {instance_id}" - ) - else: - print( - "Stopping the `flexflow-runner` spot instance (if not already off)..." - ) - os.system( - f"aws ec2 stop-instances --region us-east-2 --instance-ids {instance_id}" - ) - else: - print("Waiting for the deamon to finish running...") - # Wait until the daemon has finished running - target_workflow = [".github/workflows/gpu-ci-daemon.yml"] - n = get_num_workflow_runs(repo, target_workflow, in_progress_only=True) - while n > 0: - time.sleep(30) - n = get_num_workflow_runs(repo, target_workflow, in_progress_only=True) diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh deleted file mode 100755 index d77745451b..0000000000 --- a/.github/workflows/helpers/install_cudnn.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -set -euo pipefail -set -x - -# Cd into directory holding this script -cd "${BASH_SOURCE[0]%/*}" - -# Install CUDNN -cuda_version=${1:-11.1.1} -cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') -echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
-CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz -CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz -if [[ "$cuda_version" == "10.1" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.1-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-10.1-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "10.2" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.2-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-10.2-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.0" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.0-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-11.0-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.1" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.2" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.1.1/cudnn-11.2-linux-x64-v8.1.1.33.tgz - CUDNN_TARBALL_NAME=cudnn-11.2-linux-x64-v8.1.1.33.tgz -elif [[ "$cuda_version" == "11.3" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.1/cudnn-11.3-linux-x64-v8.2.1.32.tgz - CUDNN_TARBALL_NAME=cudnn-11.3-linux-x64-v8.2.1.32.tgz -elif [[ "$cuda_version" == "11.4" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.4/cudnn-11.4-linux-x64-v8.2.4.15.tgz - CUDNN_TARBALL_NAME=cudnn-11.4-linux-x64-v8.2.4.15.tgz -elif [[ "$cuda_version" == "11.5" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.0/cudnn-11.5-linux-x64-v8.3.0.98.tgz - CUDNN_TARBALL_NAME=cudnn-11.5-linux-x64-v8.3.0.98.tgz -elif [[ "$cuda_version" == "11.6" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.4.0/local_installers/11.6/cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz -elif [[ "$cuda_version" == "11.7" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz -fi -wget -c -q $CUDNN_LINK -if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" ]]; then - tar -xf $CUDNN_TARBALL_NAME -C ./ - CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}" - sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME/include/*" "/usr/local/include" - sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME/lib/*" "/usr/local/lib" - rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME" -else - sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local -fi -rm $CUDNN_TARBALL_NAME -sudo ldconfig diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh deleted file mode 100755 index 5ab211c962..0000000000 --- a/.github/workflows/helpers/install_dependencies.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -euo pipefail -set -x - -# Cd into directory holding this script -cd "${BASH_SOURCE[0]%/*}" - -# General dependencies -echo "Installing apt dependencies..." 
-sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \ - sudo rm -rf /var/lib/apt/lists/* - -# Install CUDNN -./install_cudnn.sh - -# Install HIP dependencies if needed -FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"} -if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then - echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." - exit 1 -elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies" - wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/focal/amdgpu-install_22.20.50205-1_all.deb - sudo apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb - rm ./amdgpu-install_22.20.50205-1_all.deb - sudo amdgpu-install -y --usecase=hip,rocm --no-dkms - sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk -else - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies" -fi -sudo rm -rf /var/lib/apt/lists/* diff --git a/.github/workflows/helpers/test_target.sh b/.github/workflows/helpers/test_target.sh deleted file mode 100755 index 69baa66364..0000000000 --- a/.github/workflows/helpers/test_target.sh +++ /dev/null @@ -1,14 +0,0 @@ -#! /usr/bin/env bash - -set -euo pipefail -set -x - -DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")" -REPO="$(realpath -- "$DIR/../../../")" - -TEST_LIBS=("${@/%/-tests}") -REGEX="^($(IFS='|'; echo "${TEST_LIBS[*]}"))\$" - -cd "$REPO/build-ci" -make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) "${TEST_LIBS[@]}" -ctest --progress --output-on-failure -L "$REGEX" diff --git a/.github/workflows/per-lib-check.yml b/.github/workflows/per-lib-check.yml deleted file mode 100644 index b54ef25819..0000000000 --- a/.github/workflows/per-lib-check.yml +++ /dev/null @@ -1,158 +0,0 @@ -name: "per-lib-checks" -on: [push, pull_request, workflow_dispatch] -concurrency: - group: build-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - cmake-build: - name: Library CMake Build - runs-on: ubuntu-20.04 - - strategy: - max-parallel: 1 - matrix: - gpu_backend: ["cuda"] - fail-fast: false - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Add helpers directory to path - run: echo "${PWD}/.github/workflows/helpers" >> $GITHUB_PATH - - - name: Free additional space on runner - run: free_space_on_runner.sh - - - name: Install nix - uses: cachix/install-nix-action@v25 - with: - github_access_token: '${{ secrets.GITHUB_TOKEN }}' - - - uses: cachix/cachix-action@v14 - with: - name: ff - skipPush: true - # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - - - name: setup nix develop shell - uses: nicknovitski/nix-develop@v1.1.0 - with: - arguments: "--accept-flake-config .#ci" - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2 - - # - name: Install system dependencies - # run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh - - # - name: Install conda and FlexFlow dependencies - # uses: conda-incubator/setup-miniconda@v2 - # with: - # activate-environment: flexflow - # environment-file: packaging/conda/environment.yml - # auto-activate-base: false - - - name: Regenerate all dtgen files - run: | - proj dtgen --force - - - name: Run cmake - run: | - cmake_${{ matrix.gpu_backend }}.sh - - - name: Build utils - run: | - build_target.sh utils - - - name: Build op-attrs - run: | - build_target.sh op-attrs - - - name: Build pcg - run: | - 
build_target.sh pcg - - - name: Build kernels - run: | - build_target.sh kernels - - - name: Build substitutions - run: | - build_target.sh substitutions - - - name: Build compiler - run: | - build_target.sh compiler - - - name: Build substitution-generator - run: | - build_target.sh substitution-generator - - - name: Build local-execution - run: | - build_target.sh local-execution - - - name: Build models - run: | - build_target.sh models - - - name: Build substitution-to-dot - run: | - build_target.sh substitution-to-dot - - - name: Build export-model-arch - run: | - build_target.sh export-model-arch - - - name: Test utils - run: | - test_target.sh utils - - - name: Test op-attrs - run: | - test_target.sh op-attrs - - - name: Test pcg - run: | - test_target.sh pcg - - - name: Test substitutions - run: | - test_target.sh substitutions - - - name: Test compiler - run: | - test_target.sh compiler - - - name: Test substitution-generator - run: | - test_target.sh substitution-generator - - - name: Test local-execution - run: | - test_target.sh local-execution - - - name: Test models - run: | - test_target.sh models - - - name: Generate code coverage - run: | - echo "gitwork: $GITHUB_WORKSPACE" - lcov --capture --directory . --output-file main_coverage.info - lcov --extract main_coverage.info "$GITHUB_WORKSPACE/lib/*" --output-file main_coverage.info - lcov --remove main_coverage.info "$GITHUB_WORKSPACE/lib/*.dtg.h" "$GITHUB_WORKSPACE/lib/*.dtg.cc" --output-file main_coverage.info - lcov --list main_coverage.info - - - name: Upload code coverage - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: main_coverage.info - flags: unittests - name: codecov-umbrella - fail_ci_if_error: false - verbose: true diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000..7e2dabd784 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,116 @@ +name: "tests" +on: [push, pull_request, workflow_dispatch] +concurrency: + group: build-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + cpu-ci: + name: CPU unit tests and build + runs-on: ubuntu-20.04 + + steps: + - name: Checkout Git Repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Free additional space on runner + run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + + - name: Install nix + uses: cachix/install-nix-action@v25 + with: + github_access_token: '${{ secrets.GITHUB_TOKEN }}' + + - uses: cachix/cachix-action@v14 + with: + name: ff + skipPush: true + # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + + - name: setup nix develop shell + uses: nicknovitski/nix-develop@v1.1.0 + env: + NIXPKGS_ALLOW_UNFREE: 1 + with: + arguments: ".#ci --accept-flake-config" + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + + - name: Regenerate all dtgen files + run: | + proj dtgen --force + + - name: Run cmake + run: | + proj cmake --dtgen-skip + + - name: Run build and tests + run: | + proj test --dtgen-skip -j$(nproc) --coverage --skip-gpu-tests + + - name: Upload code coverage + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: main_coverage.info + flags: unittests + name: codecov-umbrella + fail_ci_if_error: false + verbose: true + + gpu-ci: + name: GPU unit tests + needs: cpu-ci + runs-on: + - runs-on + - family=g4dn.xlarge + - image=ubuntu22-full-x64 + + strategy: + max-parallel: 1 + fail-fast: false + + steps: + - name: checkout git repository + uses: 
actions/checkout@v3 + with: + submodules: recursive + + - name: free additional space on runner + run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + + - name: install nix + uses: cachix/install-nix-action@v25 + with: + github_access_token: '${{ secrets.GITHUB_TOKEN }}' + + - uses: cachix/cachix-action@v14 + with: + name: ff + skipPush: true + # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + + - name: setup nix develop shell + uses: nicknovitski/nix-develop@v1.1.0 + env: + NIXPKGS_ALLOW_UNFREE: 1 + with: + arguments: ".#gpu-ci --accept-flake-config --impure" + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + + - name: regenerate all dtgen files + run: | + proj dtgen --force + + - name: run cmake + run: | + proj cmake --dtgen-skip + + - name: build and run gpu tests + run: | + proj test --dtgen-skip -j$(nproc) --skip-build-cpu-tests diff --git a/.gitignore b/.gitignore index 397ac0974d..4b40a016df 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# gdb history +.gdb_history + # dtgen files *.dtg.cc *.dtg.h diff --git a/.proj.toml b/.proj.toml index 5592f184ad..10307a6efa 100644 --- a/.proj.toml +++ b/.proj.toml @@ -18,9 +18,9 @@ build_targets = [ ] test_targets = [ - # "kernels-tests", "utils-tests", "op-attrs-tests", + "kernels-tests", "pcg-tests", "substitutions-tests", "compiler-tests", diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index f5a03d6563..0000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,19 +0,0 @@ -v20.20 (Dec 20, 20) - -### CHANGELOG v20.20 - -* Build - * FlexFlow now supports both Makefile and CMake build. More details are available in [this instruction](https://github.com/flexflow/FlexFlow/blob/master/INSTALL.md). -* Frontend Supports - * **PyTorch**. FlexFlow now supports training existing PyTorch models with minimal changes to the source code. To run PyTorch models in FlexFlow, users can first export a model to the ONNX format using `torch.onnx` and then load an ONNX model in FlexFlow for distributed training. More examples: https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch - * **ONNX**. FlexFlow supports training existing ONNX models through `flexflow.onnx.model`. More examples: https://github.com/flexflow/FlexFlow/tree/master/examples/python/onnx - * **TensorFlow Keras**. Similar to the PyTorch support. `flexflow.keras` enables distributed training of existing TensorFlow Keras models. See [this bootcamp talk](https://www.youtube.com/watch?v=PvFHu__eP9Q) for more details. -* Parallelization Optimizer - * Integrated the parallelization optimizer into the FlexFlow runtime. Users can now use the `--search-budget` and `--search-alpha` to control the FlexFlow parallelization optimizer for searching for optimized strategies. See [this post](https://flexflow.ai/search/) for the usage of the optimizer. -* Examples - * More PyTorch, ONNX, TensorFlow Keras examples have been added to the `/examples/python` folder. - * Updated the cpp examples to use the new runtime interface. -* Mapper - * Implemented a new mapper with improved runtime performance. 
-* Legion - * Updated the Legion version with improved runtime performance diff --git a/CMakeLists.txt.old b/CMakeLists.txt.old deleted file mode 100644 index d3d824bea0..0000000000 --- a/CMakeLists.txt.old +++ /dev/null @@ -1,414 +0,0 @@ -cmake_minimum_required(VERSION 3.10) -project(FlexFlow) - -include(ExternalProject) - -# Set policy CMP0074 to eliminate cmake warnings -cmake_policy(SET CMP0074 NEW) -cmake_policy(SET CMP0077 NEW) -if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") - # Fix DOWNLOAD_EXTRACT_TIMESTAMP warnings - cmake_policy(SET CMP0135 NEW) -endif() -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) -set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR}) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UNDEBUG") - -# Set a default build type if none was specified -set(default_build_type "Debug") -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to '${default_build_type}' as none was specified.") - set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE - STRING "Choose the type of build." FORCE) -endif() - -# do not disable assertions even if in release mode -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") - -if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") - set(LIBEXT ".so") -endif() - -# only used for pypi -option(FF_BUILD_FROM_PYPI "Build from pypi" OFF) - -# build shared or static flexflow lib -option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON) - -# option for using Python -option(FF_USE_PYTHON "Enable Python" ON) - -# option to download pre-compiled NCCL/Legion libraries -option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if available" ON) -option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON) -option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF) - -# option for using Python -option(FF_USE_GASNET "Run FlexFlow with GASNet" OFF) -set(FF_GASNET_CONDUITS aries udp mpi ibv ucx) -set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}") -set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS}) - - -# option for cuda arch -set(FF_CUDA_ARCH "autodetect" CACHE STRING "Target CUDA Arch") -if (FF_CUDA_ARCH STREQUAL "") - message(FATAL_ERROR "FF_CUDA_ARCH cannot be an empty string. Set it to `autodetect`, `all`, or pass one or multiple valid CUDA archs.") -endif() - -# option for nccl -option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) - -if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON") - message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. 
hip_rocm backend must have NCCL disabled.") -endif() - -# option for avx2 -option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF) - -# option for max dim -set(FF_MAX_DIM "4" CACHE STRING "Maximum dimention of tensors") - -# option for legion -option(FF_USE_EXTERNAL_LEGION "Use pre-installed Legion" OFF) - -set(FLEXFLOW_EXT_LIBRARIES "") -set(FLEXFLOW_INCLUDE_DIRS "") - -# get FLAGS from ENV -set(CC_FLAGS $ENV{CC_FLAGS}) -set(NVCC_FLAGS $ENV{NVCC_FLAGS}) -set(LD_FLAGS $ENV{LD_FLAGS}) - -# Set global FLAGS -list(APPEND CC_FLAGS - -std=c++11) - -list(APPEND NVCC_FLAGS - -std=c++11) - -add_compile_options(${CC_FLAGS}) -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) -link_libraries(${LD_FLAGS}) - -# Detect OS type and Linux version (if it applies) -set(LINUX_VERSION "") -if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") - find_program(LSB_RELEASE_EXEC lsb_release) - if(LSB_RELEASE_EXEC) - execute_process(COMMAND ${LSB_RELEASE_EXEC} -r --short - OUTPUT_VARIABLE LINUX_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Linux Version: ${LINUX_VERSION}") - endif() -endif() - -# Detect CPU architecture -message(STATUS "CPU architecture: ${CMAKE_HOST_SYSTEM_PROCESSOR}") - -if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") - set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.") -endif() - -# ZLIB -include(zlib) - -# CUDA -if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") - include(cuda) -endif() - -# CUDNN -if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") - include(cudnn) -endif() - -# NCCL -if(FF_USE_NCCL) - include(nccl) - list(APPEND FF_CC_FLAGS - -DFF_USE_NCCL) - list(APPEND FF_NVCC_FLAGS - -DFF_USE_NCCL) -endif() - -# Legion -include(legion) - -# json -include(json) - -# variant -include(variant) - -# optional -include(optional) - -if(FF_USE_PYTHON) - list(APPEND FF_CC_FLAGS - -DBINDINGS_AUGMENT_PYTHONPATH) - list(APPEND FF_NVCC_FLAGS - -DBINDINGS_AUGMENT_PYTHONPATH) -endif() - -if (FF_GPU_BACKEND STREQUAL "cuda") - list(APPEND FF_CC_FLAGS - -DFF_USE_CUDA) - list(APPEND FF_NVCC_FLAGS - -DFF_USE_CUDA) -elseif (FF_GPU_BACKEND STREQUAL "hip_cuda") - list(APPEND FF_CC_FLAGS - -DFF_USE_HIP_CUDA) - list(APPEND FF_HIPCC_FLAGS - -DFF_USE_HIP_CUDA) -elseif (FF_GPU_BACKEND STREQUAL "hip_rocm") - list(APPEND FF_CC_FLAGS - -DFF_USE_HIP_ROCM) - list(APPEND FF_HIPCC_FLAGS - -DFF_USE_HIP_ROCM) -else() -endif() - -# Start build FlexFlow -if (CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND FF_CC_FLAGS - -DFF_DEBUG) - list(APPEND FF_NVCC_FLAGS - -DFF_DEBUG) -endif() - -message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}") - -list(APPEND FF_CC_FLAGS - -DMAX_TENSOR_DIM=${FF_MAX_DIM}) - -if(FF_USE_AVX2) - list(APPEND FF_CC_FLAGS - -DFF_USE_AVX2 - -mavx2) -endif() - -list(APPEND FF_NVCC_FLAGS - -Wno-deprecated-gpu-targets - -DMAX_TENSOR_DIM=${FF_MAX_DIM}) - -list(APPEND FF_LD_FLAGS - -lrt - -ldl - -rdynamic) - -# Set FF FLAGS -add_compile_options(${FF_CC_FLAGS}) -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG) -link_libraries(${FF_LD_FLAGS}) - -list(APPEND FLEXFLOW_INCLUDE_DIRS - ${FLEXFLOW_ROOT}/include - ${FLEXFLOW_ROOT}) - -file(GLOB_RECURSE FLEXFLOW_HDR - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/include/*.h) - -file(GLOB_RECURSE FLEXFLOW_SRC - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/src/*.cc) -list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - -set(FLEXFLOW_CPP_DRV_SRC - ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) - -add_library(substitution_loader SHARED - 
${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) -target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS}) -target_link_libraries(substitution_loader nlohmann_json::nlohmann_json) - - -#message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}") - -# compile flexflow lib -if (FF_GPU_BACKEND STREQUAL "cuda") - file(GLOB_RECURSE FLEXFLOW_GPU_SRC - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/src/*.cu) - - add_compile_definitions(FF_USE_CUDA) - - if(BUILD_SHARED_LIBS) - cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE}) - else() - cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE}) - endif() -elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") - file(GLOB_RECURSE FLEXFLOW_GPU_SRC - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/src/*.cpp) - - if(BUILD_SHARED_LIBS) - add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) - else() - add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) - endif() - - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}) - - find_package(hip REQUIRED) - - if (FF_GPU_BACKEND STREQUAL "hip_cuda") - # The targets defined by the hip cmake config only target amd devices. - # For targeting nvidia devices, we'll make our own interface target, - # hip_device_nvidia, that includes the rocm and hip headers. - add_library(hip_device_nvidia INTERFACE) - - if (NOT FF_CUDA_ARCH STREQUAL "") - target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH}) - endif() - - target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) - target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) - - add_compile_definitions(FF_USE_HIP_CUDA) - - # Linking cuda: - # We do not explicitly link cuda. hipcc when targeting nvidia will - # use nvcc under the hood. nvcc when used for linking will handle - # linking cuda dependencies - target_link_libraries(flexflow hip_device_nvidia) - elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") - find_package(hipblas REQUIRED) - find_package(miopen REQUIRED) - # find_package(rocrand REQUIRED) - find_library(HIP_RAND_LIBRARY hiprand REQUIRED) - - add_compile_definitions(FF_USE_HIP_ROCM) - - # The hip cmake config module defines three targets, - # hip::amdhip64, hip::host, and hip::device. - # - # hip::host and hip::device are interface targets. hip::amdhip64 is an - # imported target for libamdhip. - # - # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64 - # and hip::device links to hip::host. Link to hip::host to just use hip without - # compiling any GPU code. Link to hip::device to compile the GPU device code. - # - # Docs (outdated): - # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html - target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) - endif() -else() - message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") -endif() - -target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) -# LEGION_URL is defined if we found a precompiled Legion library to download -if(LEGION_URL) - # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime. - # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files. 
- target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} nlohmann_json::nlohmann_json mpark_variant optional) - add_dependencies(flexflow ${LEGION_NAME}) -else() - # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the - # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need - # to link FlexFlow to ${LEGION_LIBRARY} - target_link_libraries(flexflow ${LEGION_LIBRARY} nlohmann_json::nlohmann_json mpark_variant optional) -endif() - -if(FF_USE_NCCL) - add_dependencies(flexflow ${NCCL_NAME}) -endif() - -# build binary -option(FF_BUILD_RESNET "build resnet example" OFF) -option(FF_BUILD_RESNEXT "build resnext example" OFF) -option(FF_BUILD_ALEXNET "build alexnet example" OFF) -option(FF_BUILD_DLRM "build DLRM example" OFF) -option(FF_BUILD_XDL "build XDL example" OFF) -option(FF_BUILD_INCEPTION "build inception example" OFF) -option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) -option(FF_BUILD_TRANSFORMER "build transformer example" OFF) -option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) -option(FF_BUILD_SPLIT_TEST "build split test example" OFF) -option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) -option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) -option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) -option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) -option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - -if(FF_BUILD_UNIT_TESTS) - set(BUILD_GMOCK OFF) - add_subdirectory(deps/googletest) - enable_testing() - add_subdirectory(tests/unit) -endif() - -if(FF_BUILD_SUBSTITUTION_TOOL) - add_subdirectory(tools/protobuf_to_json) -endif() - -if(FF_BUILD_VISUALIZATION_TOOL) - add_subdirectory(tools/substitutions_to_dot) -endif() - -# Python -if(FF_USE_PYTHON) - add_subdirectory(deps/pybind11) - add_subdirectory(python) -endif() - -if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/ResNet) -endif() - -if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/resnext50) -endif() - -if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/AlexNet) -endif() - -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/MLP_Unify) -endif() - -if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test) -endif() - -if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test_2) -endif() - -if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/InceptionV3) -endif() - -#TODO: Once functional add to BUILD_ALL_EXAMPLES -if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/candle_uno) -endif() - -if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/DLRM) - - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) - #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - - #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) - #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) -endif() - -if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/XDL) -endif() - -if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/Transformer) -endif() - -# installation -set(INCLUDE_DEST "include") -set(LIB_DEST "lib") 
-install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) -install(TARGETS flexflow DESTINATION ${LIB_DEST}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e42f7e5003..1a1b3c9bee 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,233 +1,261 @@ # Developers Guide -## Code Organization -The bulk of the FlexFlow source code is stored in the following folders: - -1. `examples`: example DNNs in C++ and Python -2. `include`: the FlexFlow headers -3. `src`: the FlexFlow source code -4. `python`: bindings for the Python interface - -The `src` folder is divided into the following subfolders: - -1. `loss_functions`: contains the implementation of all the supported loss functions, as well as the backward function to be used during training. -2. `mapper`: contains the implentation of the Legion custom mapper for FlexFlow, `FFMapper`. -3. `metric_functions`: contains the implementation of all the metrics functions, such as accuracy, categorical crossentropy, or mean squared error. -4. `ops`: contains the implementation of all tensor operators. -5. `parallel_ops`: contains the operators used to represent parallelization on the Parallel Computation Graph (PCG) as described in the [Unity paper](https://www.usenix.org/system/files/osdi22-unger.pdf). -6. `recompile`: support for the dynamic recompilation functionality described in [this paper](https://arxiv.org/pdf/2205.01848.pdf) -7. `runtime`: contains the implementation of the high-level FlexFlow runtime -8. `utils`: only contains implementation of the RecordFormatter class. - -In many parts of the source code you will see triplets of files with the following three different extensions: `.cc`, `.cpp` and `.cu`. The `.cc` file contains the main, high-level C++ implementation, whereas the `.cpp` and `.cu` file contain, respectively, the HIP and CUDA kernels. +## Setup + +> [!NOTE] +> If you are developing on Stanford's sapling cluster, instead see the instructions [here](./docs/SAPLING.md). +> If you don't know what this means, you're not using sapling so you should just continue reading. + +1. FlexFlow Train uses [nix](https://nix.dev/manual/nix/2.24/) to manage dependencies and the development environment. + There exist a number of ways to install nix, but we recommend one of the following: + + 1. If you have root permissions: [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer) + + 2. If you don't have root permissions: [DavHau/nix-portable](https://github.com/DavHau/nix-portable). + Note that nix-portable does not work particularly well if the nix store is in NFS[^1] or other distributed file systems, + so if you are running on an HPC cluster where the home directory is mounted via a distributed file system we recommend setting the + `NP_LOCATION` environment to `/tmp` or some other non-NFS location. + + While you should at least skim nix-portable's setup instructions, you'll probably end up doing something like this: + ``` + $ USERBIN="${XDG_BIN_HOME:-$HOME/.local/bin}" + $ wget 'https://github.com/DavHau/nix-portable/releases/download/v010/nix-portable' -O "$USERBIN/nix-portable" + ... + $ chmod u+x "$USERBIN/nix-portable" + ... + $ ln -sf "$USERBIN/nix-portable" "$USERBIN/nix" + ... + $ echo 'export PATH=$USERBIN:$PATH' >> ~/.bashrc + ... 
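+   $ # optional: if your home directory is on NFS, also apply the NP_LOCATION recommendation from above
+   $ echo 'export NP_LOCATION=/tmp' >> ~/.bashrc
+   ...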
+ ``` + Now if everything is setup properly, you should be able to see something like the following (don't worry if the version number is slightly different) if you run `nix --version`: + ``` + $ nix --version + nix (Nix) 2.20.6 + ``` + +[^1]: [Network File System](https://en.wikipedia.org/wiki/Network_File_System) + +2. Clone the FlexFlow Train repository (or, if you'd prefer, follow the alternative setup instructions in the [ff-dev](#ff-dev-optional) section) -The best way to familiarize with the FlexFlow codebase is to walk through one of the existing examples, then check out the relevant FlexFlow runtime functions that are used in the example. We provide examples in both Python and C++. The Python interface is the most up-to-date, and the one that is intended to be used by users. To learn how to _run_ a DNN in FlexFlow, please refer to the scripts in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python) folder. The C++ interface is intended mostly for development purposes and may have some rough edges. Nevertheless, the C++ examples are the preferred ones to look at if you want to familiarize with the internals of the FlexFlow implementation. - -### AlexNet example (C++) - -In this section, we will walk through the AlexNet C++ implementation, which can be found in the [examples/cpp/AlexNet](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp/AlexNet) folder of the repository. You can use this example as a template to write your own C++ DNN model using FlexFlow. - -You can start by taking a look at the `alexnet.cc` file, containing the core of the implementation. You will notice the absence of a `main()` function. The FlexFlow C++ interface uses the `main()` function defined in [src/runtime/cpp_driver.cc](https://github.com/flexflow/FlexFlow/blob/master/src/runtime/cpp_driver.cc), so you will not need to create a new one when writing a FlexFlow program. Instead, you will use a function called `top_level_task` and with the following signature: - -```c++ -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); ``` - -Inside the `top_level_task` function, you will want to create a FFModel object, which is usually initialized by passing a FFConfig object to the constructor: - -```c++ -FFConfig ffConfig; -FFModel ff(ffConfig); +$ FF_DIR="$HOME/flexflow-train" # or wherever else you want to put the repository +$ git clone --recursive git@github.com:flexflow/flexflow-train.git "$FF_DIR" +... ``` -`FFModel` is a very large class, and is the cornerstone of every FlexFlow DNN, providing the methods required to instantiate input tensors, add layers, compile the model, etc... - -#### Tensor creation - -The typical first step in a FlexFlow DNN is to define the input tensors. You can do that using the `FFModel.create_tensor` function. In the case of AlexNet: - -```c++ -Tensor input; -{ - int const dims[] = {ffConfig.batchSize, 3, 229, 229}; - input = ff.create_tensor<4>(dims, DT_FLOAT); -} -``` - -In the case of AlexNet, the input tensor has dimension `batch_size x 3 x 229 x 229`, so it is a 4-dimensional tensor. To initialize the tensor, we use the templated `create_tensor` function, which is part of `FFModel`. It may be useful to know that the `create_tensor` function lays out the tensor's dimensions in reverse order. For instance, in the snippet above, printing the `input` tensor (which can be done using the instruction below) will show dimensions: `[229, 229, 3, batch_size]`. 
- -```c++ -input->print("input tensor") -``` +3. Enter the nix-provided `default` development environment[^2] -There are two versions of the `create_tensor` function: one (used in the last snippet above) uses a template that takes the number of tensor dimensions as its parameter; the second is a wrapper around the first, and takes the number of tensor dimensions as a regular function parameter. Both versions are implemented in `model.cc`, and their signature is identical, except for the number of dimensions parameter. Below, we discuss the implementation of the `create_tensor` wrapper, since it illustrates a common pattern among FlexFlow functions: +[^2]: aka "dev shell" -```c++ -Tensor FFModel::create_tensor(int numdim, - int const dims[], - DataType data_type, - Layer const *layer, - int idx, - bool create_grad) { - switch (numdim) { -#define DIMFUNC(DIM) \ - case DIM: \ - return create_tensor(dims, data_type, layer, idx, create_grad); - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false && "Unsupported dim!"); - } -} ``` - -The `LEGION_FOREACH_N(DIMFUNC)` macro is defined in [deps/legion/runtime/legion/legion_config.h](https://gitlab.com/StanfordLegion/legion/-/blob/master/runtime/legion/legion_config.h). The preprocessor replaces the block of code between `#define DIMFUNC(DIM)` and `#undef DIMFUNC` with a `case` statement for each integer between `1` and the `LEGION_MAX_DIM`, controlled by the `Legion_MAX_DIM` Legion CMake variable, which in case of FlexFlow, is set equal to `FF_MAX_DIM` in [cmake/legion.cmake](https://github.com/flexflow/FlexFlow/blob/master/cmake/legion.cmake). For example, in the default case, where `FF_MAX_DIM` is set to 4, the preprocessor will rewrite the `switch` loop above as follows: - -```c++ -switch (numdim) { - case 1: - return create_tensor<1>(dims, data_type, layer, idx, create_grad); - case 2: - return create_tensor<2>(dims, data_type, layer, idx, create_grad); - case 3: - return create_tensor<3>(dims, data_type, layer, idx, create_grad); - case 4: - return create_tensor<4>(dims, data_type, layer, idx, create_grad); - default: - assert(false && "Unsupported dim!"); -} +$ cd "$FF_DIR" +$ nix develop --accept-flake-config ``` -In addition to the two versions of `create_tensor` discussed above, `model.cc` also offers the `create_tensor_legion_ordering` function, which simply creates a tensor without reversing the order of the input dimensions. The explicit template instantiations at the bottom of `model.cc` will ensure that functions such `create_tensor` are only instantiated for number of dimensions that are less or equal to `FF_MAX_DIM`. - -#### Adding layers to a DNN model +4. Build and run the non-GPU-required tests (systems that have access to CUDA GPUs can also run the GPU-mandatory tests by following the instructions [here](#gpu-setup)) -Going back to the AlexNet example, after defining the input tensors, we can add each of the DNN's layers by using the corresponding method from `FFModel`. For instance, the first layer is added using: - -```c++ -t = ff.conv2d(input, 64, 11, 11, 4, 4, 2, 2, AC_MODE_RELU); ``` -The `conv2d` function is defined in [src/ops/conv_2d.cc](https://github.com/flexflow/FlexFlow/blob/master/src/ops/conv_2d.cc). Just like the other `FFModel` layer functions, it creates a new `Layer` object, populates with all relevant properties, and then enqueues to the list of layers in the `FFModel` class. - -#### Optimizer and training metrics +(ff) $ proj cmake +... +(ff) $ proj test --skip-gpu-tests +... 
+``` +If everything is correctly configured, you should see a bunch of build messages followed by something like +``` +(ff) $ proj test --skip-gpu-tests +421/421 Test #441: get_transformer_computation_graph +100% tests passed, 0 tests failed out of 421 + +Label Time Summary: +compiler-tests = 6.13 sec*proc (19 tests) +local-execution-tests = 0.13 sec*proc (3 tests) +models-tests = 0.05 sec*proc (4 tests) +op-attrs-tests = 0.48 sec*proc (59 tests) +pcg-tests = 0.33 sec*proc (33 tests) +substitution-generator-tests = 0.06 sec*proc (2 tests) +substitutions-tests = 0.10 sec*proc (9 tests) +utils-tests = 1.20 sec*proc (293 tests) + +Total Test time (real) = 8.64 sec +``` -After adding the DNN layers, the next step before compiling the model for training is to initialize an optimizer and then create a vector with all the metrics that you want to monitor at each training step. +If you don't, or if you see any tests failing, please double check that you have followed the instructions above. +If you have and are still encountering an issue, please [contact us](#contact-us) with a detailed description of your platform and the commands you have run. +### GPU setup -#### Model compilation +If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell: +``` +$ NIXPKGS_ALLOW_UNFREE=1 nix develop .#gpu --accept-flake-config --impure +``` +and then running +``` +(ff) $ proj test +... +``` +You should see the additional GPU tests run. If you instead see a message like -TODO +> `Error: ... Pass --skip-gpu-tests to skip running tests that require a GPU` +Double check that you are correctly in the `gpu` devshell, not the `default` devshell. +If you've confirmed that you are in the correct devshell and are still encountering issues, [contact us](#contact-us) +with a detailed description of your platform and the commands you have run. -## Continuous Integration -We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows: +### ff-dev (optional) -1. `build.yml`: checks that the build & installation of FlexFlow succeed, using both the CMake and Makefile systems -2. `clang-format-check.yml`: ensures that the source code is properly formatted. -3. `docker-build.yml`: checks that the Docker containers can build and run FlexFlow properly. It also publishes a new version of the FlexFlow containers to the repo's package register for each push to the master branch -4. `gpu-ci.yml`: runs all the tests that require a GPU to run. -5. `gpu-ci-daemon.yml`: an helper workflow that turns on/off the GPU instance used by the test above -6. `multinode-test.yml`: runs the same GPU tests from the `gpu-ci.yml` workflow, but using multiple (simulated) nodes. The test currently simulates two nodes, each with 2 GPUs. To run FlexFlow on multiple nodes, we compile Legion with GASNET enabled, and choose MPI as the GASNET conduit. Compared to the single-node version, this test is much more time-consuming (about 4h instead 40mins at the time of writing), so we only run the test on the FlexFlow `master` branch every other day. -7. `pip-install.yml`: checks the build & installation of FlexFlow using `pip` -8. 
`shell-check.yml`: runs shellcheck on all bash scripts in the repo +Many of the FlexFlow Train developers use an additional set of scripts called [ff-dev](https://github.com/lockshaw/ff-dev) +to automate many common git operations associated with FlexFlow Train development. -We also have three placeholder workflows: `build-skip.yml`, `docker-build-skip.yml`, `gpu-ci-skip` and `pip-install-skip.yml`. These always pass and are used only in the case of skipped workflows whose status is required to merge a PR; we implement the "hack" officially recommended by Github ([see here](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/defining-the-mergeability-of-pull-requests/troubleshooting-required-status-checks#handling-skipped-but-required-checks)). +To setup ff-dev, run TODO (tracked in [#1573](https://github.com/flexflow/flexflow-train/issues/1573)). -In the next section, we walk through an example workflow, similar to the ones found in this repo. An important thing to note is that Github workflows do not run unless they are properly linted. If you encounter a formatting/linting error, you can lint your workflow file using `prettier` (installation instructions [here](https://prettier.io/docs/en/install.html)): + + +### nix-direnv (optional) + +If you installed nix system-wide (e.g., using [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)), +you can use [direnv](https://direnv.net/) to automatically enter the FlexFlow Train development environment when you `cd` into the repository, rather +than having to manually run `nix develop`. +[direnv](https://direnv.net) will also automatically exit the environment when you `cd` out of the repository, and (if configured using [nix-direnv](https://github.com/nix-community/nix-direnv)) will even automatically reload the environment if the `flake.nix` file changes. +You can find the installation instructions for direnv [here](https://direnv.net/docs/installation.html), and if you would like automatic environment reloading you can also install nix-direnv using the instructions [here](https://github.com/nix-community/nix-direnv?tab=readme-ov-file#installation). + +Once you have direnv (and optionally nix-direnv) installed, cd into the root of your cloned FlexFlow Train repository and run +``` +$ echo 'use flake . --accept-flake-config' > .envrc +``` +You should see a message that the `.envrc` file you just created is blocked. +Run the command shown in the error message (i.e., `direnv allow`), and direnv should automatically place you in the environment. +For more information on using direnv with nix, see [here](https://github.com/direnv/direnv/wiki/Nix). - - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.8 - id: cuda-toolkit - with: - cuda: "11.1.1" - # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement - use-github-cache: "false" +## Building, Testing, etc. - - name: Install FlexFlow Dependencies - run: .github/workflows/helpers/install_dependencies.sh +Most operations you'll want to perform while developing FlexFlow Train are provided through a small python utility called [proj](https://github.com/lockshaw/proj). +`proj` is automatically pulled in by nix when you enter the dev shell, so you should be able to run +``` +(ff) $ proj -h ``` +and see the full list of operations that `proj` supports. +`proj` commands can be run from anywhere in the repository (i.e., they do not have to be run from the root). 
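+
+For example, after switching branches, a typical sequence (a sketch; these subcommands are described in more detail below) is to regenerate the CMake files, then rebuild and rerun the CPU-only tests:
+```
+(ff) $ proj cmake
+...
+(ff) $ proj build
+...
+(ff) $ proj test --skip-gpu-tests
+...
+```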
+To help you get started, however, a list of common command invocations is included here: + +- To build FlexFlow Train: + ``` + (ff) $ proj build + ``` +- To build and run FlexFlow Train tests (without a GPU): + ``` + (ff) $ proj test --skip-gpu-tests + ``` +- To build and run FlexFlow Train tests (with a GPU): + ``` + (ff) $ proj test + ``` +- To regenerate CMake files (necessary anytime you switch branches or modify the CMake source. If you're ever running into weird build issues, try running this and see if it fixes things): + ``` + (ff) $ proj cmake + ``` +- To format all of the FlexFlow Train sources files: + ``` + (ff) $ proj format + ``` +- To build the FlexFlow Train Doxygen docs: + ``` + (ff) $ proj doxygen + ``` + You can also add the `--browser` command to automatically open the built docs in your default browser if you are working on your local machine. -The first instruction in a workflow file sets the workflow's name. The name is not required to be unique, but it is preferrable to use unique names to avoid conflicts. +## Code Organization -Next, the `on:` section allows you to control what events trigger a workflow run. A full list of events that can trigger a workflow run is available [here](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). Each trigger can take options that further filter out the scenarios where the workflow runs. In the example above, we have the following triggers: +The bulk of the FlexFlow source code is stored in the following folders: -1. A `pull_request` trigger, triggering a workflow run when a PR is opened, and for each new commit to a branch associated with an open PR. The `paths` option allows you to choose which files in the repository need to be modified to make the workflow run. For instance, in the example, the `pull_request` trigger is only activated for PRs where either `.github/workflows/build.yml` or a file in the `src` folder is modified. -2. A `push` trigger, triggering a run for each push, no matter if there is an open PR or not. Here, in addition to the `paths` option, we have a `branches` option, restricting the trigger to activate only for commits to the `master` branch, but not for commits to other branches. -3. A `schedule` trigger, triggering the workflow at specific times. The syntax for chron workflows is explained [here](https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#onschedule). -4. A `workflow_dispatch` trigger, enabling authorized users to manually run the workflow. +1. `lib`: The C++ code that makes up FlexFlow's core, split up into a number of libraries. You can find a description of each library [here](./lib/README.md). +2. `bin`: Command-line interfaces for FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in `lib` for the actual processing/logic. You can find a description of each binary [here](./bin/README.md). +3. `bindings`: Python (or any additional languages added in the future) bindings for FlexFlow Train +4. `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files as either `.md` files or inline in the language's documentation syntax (i.e., [Doxygen](https://www.doxygen.nl/manual/index.html) for C++ and [Sphinx](https://www.sphinx-doc.org/en/master/) for Python). +5. `cmake`: CMake configuration for building FlexFlow Train. 
Note that unless you're modifying the build configuration (i.e., adding a library, additional dependencies, etc.), you generally should use [proj](#building-testing-etc) instead of interacting with CMake directly.
+6. `deps`: Third-party dependencies included as submodules. Note that since FlexFlow Train moved to [nix](https://nix.dev/manual/nix/2.24/) for managing dependencies, many (but not all) of these are used in the default configuration.

## Continuous Integration

We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows:

1. [`tests`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Also uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train).
2. [`clang-format.yml`](./.github/workflows/clang-format.yml): ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`).
3. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.

GPU machines for CI are managed using [runs-on](https://runs-on.com/).
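
Before opening a PR, you can approximate these checks locally (a sketch, assuming the `default` devshell described above; the GPU portion of `tests` additionally needs the `gpu` devshell):
```
(ff) $ proj format                    # what the clang-format workflow checks
(ff) $ proj test --skip-gpu-tests     # roughly the CPU portion of the tests workflow
```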
## Contributing to FlexFlow
-We want to make contributing to this project as easy and transparent as possible.
-### Formatting
-We use `clang-format` to format our C++ code. If you make changes to the code and the Clang format CI test is failing, you can lint your code by running: `./scripts/format.sh` from the main folder of this repo.
+We actively welcome your pull requests. Note that we may already be working on the feature/fix you're looking for, so we suggest searching through the [open issues](https://github.com/flexflow/flexflow-train/issues), [open PRs](https://github.com/flexflow/flexflow-train/pulls), and [contacting us](#contact-us) to make sure you're not duplicating existing effort!
+
+The steps for getting changes merged into FlexFlow are relatively standard:
+
+1. [Fork the repo](https://github.com/flexflow/flexflow-train/fork) and either create a new branch based on `master`, or just modify `master` directly.
+2. If you've added code that should be tested, add tests. The process for adding tests for code under `lib` is documented [here](./lib/README.md#tests). Adding tests for other parts of the code is currently undocumented, so you will need to [contact us](#contact-us) for information on how to do it.
+3. Ensure the code builds (i.e., run `proj build`).
+4. Ensure the test suite passes (i.e., run `proj test`).
+5. Format the code (i.e., run `proj format`).
+6. Create a new PR from your modified branch to the `master` branch in FlexFlow Train.
+   Provide a brief description of the changes you've made and link any related/closed issues.
-### Pull Requests
-We actively welcome your pull requests.
+Code review is done using [Reviewable](https://reviewable.io/).
+If you haven't used Reviewable before, please read through (or at least skim) the ["Reviews" section](https://docs.reviewable.io/reviews.html) of the Reviewable documentation.
-1. Fork the repo and create your branch from `master`.
-2. If you've added code that should be tested, add tests.
-3. If you've changed APIs, update the documentation.
-4. Ensure the test suite passes.
-5. Make sure your code lints.
+## Contact Us
-### Issues
-We use GitHub issues to track public bugs. Please ensure your description is
-clear and has sufficient instructions to be able to reproduce the issue.
+Either [create an issue](https://github.com/flexflow/flexflow-train/issues/new) or join the FlexFlow [Zulip](https://flexflow.zulipchat.com/join/mtiwtwttgggnivrkb6vlakbr/) instance.
+For any reported bugs, please ensure that your description is clear and has sufficient information for us to reproduce the issue.
-### License
-By contributing to FlexFlow, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.
+## License
+By contributing to FlexFlow Train, you agree that your contributions will be licensed
+under the [LICENSE](./LICENSE) file in the root directory of this source tree.
diff --git a/FlexFlow.mk b/FlexFlow.mk
deleted file mode 100644
index 4d63ec83d4..0000000000
--- a/FlexFlow.mk
+++ /dev/null
@@ -1,184 +0,0 @@
-# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -ifndef LG_RT_DIR -LG_RT_DIR ?= $(FF_HOME)/deps/legion/runtime -endif - -ifndef CUDA_HOME -CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) -endif - -ifndef CUDNN_HOME -CUDNN_HOME = $(CUDA_HOME) -endif - -ifndef NCCL_HOME -NCCL_HOME = $(CUDA_HOME) -endif - -ifndef HIPLIB_HOME -HIPLIB_HOME = /opt/rocm-4.3.1 -endif - -#ifndef MPI_HOME -#MPI_HOME = $(patsubst %/bin/mpicc,%,$(shell which mpicc | head -1)) -#endif - -ifeq ($(strip $(USE_GASNET)),1) - ifndef GASNET - $(error USE_GASNET is enabled, but GASNET variable is not defined, aborting build) - endif -endif - -GEN_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/mapper/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/metrics_functions/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/ops/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/parallel_ops/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/recompile/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/runtime/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/utils/dot/ -name '*.cc') -GEN_SRC := $(filter-out $(FF_HOME)/src/runtime/cpp_driver.cc, $(GEN_SRC)) - -FF_CUDA_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/mapper/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/metrics_functions/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/ops/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/parallel_ops/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/recompile/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/runtime/ -name '*.cu')\ - $(shell find $(FF_HOME)/src/utils/dot/ -name '*.cu') - -FF_HIP_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/mapper/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/metrics_functions/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/ops/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/parallel_ops/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/recompile/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/runtime/ -name '*.cpp')\ - $(shell find $(FF_HOME)/src/utils/dot/ -name '*.cpp') - -GEN_GPU_SRC += $(FF_CUDA_SRC) -ifeq ($(strip $(HIP_TARGET)),CUDA) - GEN_HIP_SRC += $(FF_CUDA_SRC) -else - GEN_HIP_SRC += $(FF_HIP_SRC) -endif - -ifneq ($(strip $(FF_USE_PYTHON)), 1) - GEN_SRC += ${FF_HOME}/src/runtime/cpp_driver.cc -endif - - -INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 -NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 -HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 -GASNET_FLAGS += -# For Point and Rect typedefs -CC_FLAGS += -std=c++11 -NVCC_FLAGS += -std=c++11 -HIPCC_FLAGS += -std=c++11 - -ifeq ($(strip $(FF_USE_NCCL)), 1) -INC_FLAGS += -I$(MPI_HOME)/include -I$(NCCL_HOME)/include -CC_FLAGS += -DFF_USE_NCCL -NVCC_FLAGS += -DFF_USE_NCCL -HIPCC_FLAGS += -DFF_USE_NCCL -LD_FLAGS += -L$(NCCL_HOME)/lib -lnccl -endif - -ifeq ($(strip 
$(FF_USE_AVX2)), 1) -CC_FLAGS += -DFF_USE_AVX2 -mavx2 -endif - -ifeq ($(strip $(USE_CUDA)),1) -CC_FLAGS += -DFF_USE_CUDA -NVCC_FLAGS += -DFF_USE_CUDA -INC_FLAGS += -I$(CUDNN_HOME)/include -I$(CUDA_HOME)/include -LD_FLAGS += -lcudnn -lcublas -lcurand -L$(CUDNN_HOME)/lib64 -L$(CUDA_HOME)/lib64 -endif - -ifeq ($(strip $(USE_HIP)),1) -ifeq ($(strip $(HIP_TARGET)),CUDA) -CC_FLAGS += -DFF_USE_HIP_CUDA -HIPCC_FLAGS += -DFF_USE_HIP_CUDA -INC_FLAGS += -I$(CUDNN_HOME)/include -I$(CUDA_HOME)/include -LD_FLAGS += -lcudnn -lcublas -lcurand -L$(CUDNN_HOME)/lib64 -L$(CUDA_HOME)/lib64 -else -CC_FLAGS += -DFF_USE_HIP_ROCM -HIPCC_FLAGS += -DFF_USE_HIP_ROCM -INC_FLAGS += -I$(HIPLIB_HOME)/include -I$(HIPLIB_HOME)/include/miopen -I$(HIPLIB_HOME)/include/rocrand -I$(HIPLIB_HOME)/include/hiprand -LD_FLAGS += -lMIOpen -lhipblas -lhiprand -L$(HIPLIB_HOME)/lib -endif -endif - -# CUDA arch variables -GPU_ARCH ?= all - -# translate legacy arch names into numbers -ifeq ($(strip $(GPU_ARCH)),pascal) -override GPU_ARCH = 60 -NVCC_FLAGS += -DPASCAL_ARCH -endif -ifeq ($(strip $(GPU_ARCH)),volta) -override GPU_ARCH = 70 -NVCC_FLAGS += -DVOLTA_ARCH -endif -ifeq ($(strip $(GPU_ARCH)),turing) -override GPU_ARCH = 75 -NVCC_FLAGS += -DTURING_ARCH -endif -ifeq ($(strip $(GPU_ARCH)),ampere) -override GPU_ARCH = 80 -NVCC_FLAGS += -DAMPERE_ARCH -endif - -ifeq ($(strip $(GPU_ARCH)),all) - # detect based on what nvcc supports - ALL_ARCHES = 60 61 62 70 72 75 80 86 - override GPU_ARCH = $(shell for X in $(ALL_ARCHES) ; do \ - $(NVCC) -gencode arch=compute_$$X,code=sm_$$X -cuda -x c++ /dev/null -o /dev/null 2> /dev/null && echo $$X; \ - done) -endif - -# finally, convert space-or-comma separated list of architectures (e.g. 35,50) -# into nvcc -gencode arguments -ifeq ($(findstring nvc++,$(shell $(NVCC) --version)),nvc++) -NVCC_FLAGS += $(foreach X,$(subst $(COMMA), ,$(GPU_ARCH)),-gpu=cc$(X)) -else -COMMA=, -NVCC_FLAGS += $(foreach X,$(subst $(COMMA), ,$(GPU_ARCH)),-gencode arch=compute_$(X)$(COMMA)code=sm_$(X)) -endif - -#ifndef HDF5 -#HDF5_inc ?= /usr/include/hdf5/serial -#HDF5_lib ?= /usr/lib/x86_64-linux-gnu/hdf5/serial -#INC_FLAGS += -I${HDF5}/ -#LD_FLAGS += -L${HDF5_lib} -lhdf5 -#endif - -########################################################################### -# -# Don't change anything below here -# -########################################################################### - -include $(LG_RT_DIR)/runtime.mk diff --git a/INSTALL.md b/INSTALL.md deleted file mode 100644 index 158bf4a677..0000000000 --- a/INSTALL.md +++ /dev/null @@ -1,140 +0,0 @@ -# Installing FlexFlow -To build and install FlexFlow, follow the instructions below. - -## 1. Download the source code -Clone the FlexFlow source code, and the third-party dependencies from GitHub. -``` -git clone --recursive https://github.com/flexflow/FlexFlow.git -``` - -## 2. Install system dependencies -FlexFlow has system dependencies on cuda and/or rocm depending on which gpu backend you target. The gpu backend is configured by the cmake variable `FF_GPU_BACKEND`. By default, FlexFlow targets CUDA. `docker/base/Dockerfile` installs system dependencies in a standard ubuntu system. - -### Targeting CUDA - `FF_GPU_BACKEND=cuda` -If you are targeting CUDA, FlexFlow requires CUDA and CUDNN to be installed. You can follow the standard nvidia installation instructions [CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) and [CUDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html). 
- -Disclaimer: CUDA architectures < 60 (Maxwell and older) are no longer supported. - -### Targeting ROCM - `FF_GPU_BACKEND=hip_rocm` -If you are targeting ROCM, FlexFlow requires a ROCM and HIP installation with a few additional packages. Note that this can be done on a system with or without an AMD GPU. You can follow the standard installation instructions [ROCM](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.3/page/Introduction_to_ROCm_Installation_Guide_for_Linux.html) and [HIP](https://docs.amd.com/bundle/HIP-Installation-Guide-v5.3/page/Introduction_to_HIP_Installation_Guide.html). When running `amdgpu-install`, install the use cases hip and rocm. You can avoid installing the kernel drivers (not necessary on systems without an AMD graphics card) with `--no-dkms` I.e. `amdgpu-install --usecase=hip,rocm --no-dkms`. Additionally, install the packages `hip-dev`, `hipblas`, `miopen-hip`, and `rocm-hip-sdk`. - -See `./docker/base/Dockerfile` for an example ROCM install. - -### Targeting CUDA through HIP - `FF_GPU_BACKEND=hip_cuda` -This is not currently supported. - -## 3. Install the Python dependencies -If you are planning to build the Python interface, you will need to install several additional Python libraries, please check [this](https://github.com/flexflow/FlexFlow/blob/master/requirements.txt) for details. If you are only looking to use the C++ interface, you can skip to the next section. - -**We recommend that you create your own `conda` environment and then install the Python dependencies, to avoid any version mismatching with your system pre-installed libraries.** - -The `conda` environment can be created and activated as: -``` -conda env create -f conda/environment.yml -conda activate flexflow -export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" -``` - -## 4. Configuring the FlexFlow build -To start off, create a build directory: in the root directory of the cloned FlexFlow repository, run -``` -mkdir build -cd build -``` - -You can configure a FlexFlow build by running the `config/config.linux` file in the build folder. If you do not want to build with the default options, you can set your configurations by passing (or exporting) the relevant environment variables. We recommend that you spend some time familiarizing with the available options by scanning the `config/config.linux` file. In particular, the main parameters are: - -1. `CUDA_DIR` is used to specify the directory of CUDA. It is only required when CMake can not automatically detect the installation directory of CUDA. -2. `CUDNN_DIR` is used to specify the directory of CUDNN. It is only required when CUDNN is not installed in the CUDA directory. -3. `FF_CUDA_ARCH` is used to set the architecture of targeted GPUs, for example, the value can be 60 if the GPU architecture is Pascal. To build for more than one architecture, pass a list of comma separated values (e.g. `FF_CUDA_ARCH=70,75`). To compile FlexFlow for all GPU architectures that are detected on the machine, pass `FF_CUDA_ARCH=autodetect` (this is the default value, so you can also leave `FF_CUDA_ARCH` unset. If you want to build for all GPU architectures compatible with FlexFlow, pass `FF_CUDA_ARCH=all`. **If your machine does not have any GPU, you have to set FF_CUDA_ARCH to at least one valid architecture code (or `all`)**, since the compiler won't be able to detect the architecture(s) automatically. -4. `FF_USE_PYTHON` controls whether to build the FlexFlow Python interface. -5. 
`FF_USE_NCCL` controls whether to build FlexFlow with NCCL support. By default, it is set to ON. -6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in [MULTI-NODE.md](MULTI-NODE.md) and set the corresponding parameters as follows: -* To build FlexFlow with GASNet, set `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT` as a specific conduit (e.g. `ibv`, `mpi`, `udp`, `ucx`) in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX. -* To build FlexFlow with native UCX, set `FF_LEGION_NETWORKS=ucx` in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX. -8. `FF_BUILD_EXAMPLES` controls whether to build all C++ example programs. -9. `FF_MAX_DIM` is used to set the maximum dimension of tensors, by default it is set to 4. -10. `FF_USE_{NCCL,LEGION,ALL}_PRECOMPILED_LIBRARY`, controls whether to build FlexFlow using a pre-compiled version of the Legion, NCCL (if `FF_USE_NCCL` is `ON`), or both libraries . By default, `FF_USE_NCCL_PRECOMPILED_LIBRARY` and `FF_USE_LEGION_PRECOMPILED_LIBRARY` are both set to `ON`, allowing you to build FlexFlow faster. If you want to build Legion and NCCL from source, set them to `OFF`. - -For more options, run `ccmake ..` or `cmake -LH ..` from the build directory. -You will likely only need to modify options starting with `FF_` or `CMAKE_`, though it is not impossible that in strange situtations other options may be necessary. - -## 5. Build FlexFlow -You can build FlexFlow in three ways: with CMake, with Make, and with `pip`. We recommend that you use the CMake building system as it will automatically build all C++ dependencies inlcuding NCCL and Legion. - -### Building FlexFlow with CMake - -**NOTE: In repo-refactor this is currently the only supported build system. Make and pip currently likely don't work.** - -**NOTE: In repo-refactor, currently FF_USE_PYTHON should be set to OFF** - -To build FlexFlow with CMake, go to the FlexFlow home directory, and run -```bash -cmake .. -DCMAKE_BUILD_TYPE=Release #... any additional options ... -``` -or -```bash -ccmake .. -``` -to use the TUI/GUI provided by CMake. - -Once either has completed, then run -``` -make -j N -``` -where N is the desired number of threads to use for the build. -To build with all cores available on the system, run -``` -make -j $(nproc) -``` - -### Building FlexFlow with pip -To build Flexflow with `pip`, run `pip install .` from the FlexFlow home directory. This command will build FlexFlow, and also install the Python interface as a Python module. - -### Building FlexFlow with Make -The Makefile we provide is mainly for development purposes, and may not be fully up to date. To use it, run: -``` -cd python -make -j N -``` - -## 6. Test FlexFlow -After building FlexFlow, you can test it to ensure that the build completed without issue, and that your system is ready to run FlexFlow. - -### Set the `FF_HOME` environment variable before running FlexFlow. To make it permanent, you can add the following line in ~/.bashrc. -``` -export FF_HOME=/path/to/FlexFlow -``` - -### Run FlexFlow Python examples -The Python examples are in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python). The native, Keras integration and PyTorch integration examples are listed in `native`, `keras` and `pytorch` respectively. 
- -To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `python` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the following flags: - -* `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/python"` -* `export FF_USE_NATIVE_PYTHON=1` - -**We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:** - -``` -cd "$FF_HOME" -./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize -``` -A script to run all the Python examples is available at `tests/multi_gpu_tests.sh` - -### Run FlexFlow C++ examples - -The C++ examples are in the [examples/cpp](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp). -For example, the AlexNet can be run as: -``` -./alexnet -ll:gpu 1 -ll:fsize -ll:zsize -``` - -Size of buffers is in MBs, e.g. for an 8GB gpu `-ll:fsize 8000` - -## 7. Install FlexFlow -If you built/installed FlexFlow using `pip`, this step is not required. If you built using Make or CMake, install FlexFlow with: -``` -cd build -make install -``` diff --git a/MULTI-NODE.md b/MULTI-NODE.md deleted file mode 100644 index 78edba62c0..0000000000 --- a/MULTI-NODE.md +++ /dev/null @@ -1,29 +0,0 @@ -# Running FlexFlow On Multiple Nodes -To build, install, and run FlexFlow on multiple nodes, follow the instructions below. We take AWS as an example to present the instructions. - -## 1. Spin up instances -Spin up multiple instances with GPU support. We choose p3.2xlarge with [Deep Learning AMI GPU PyTorch 1.13.1 (Ubuntu 20.04)](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-13-ubuntu-20-04/) to simplify the procedure. - -Place the instances in a [placement group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) which utilizes `cluster` as strategy to achieve the low-latency network performance. - -To enable the communications between instances, you should attach the same security group to all instances and add an inbound rule in the security group to enable all the incoming traffic from the same security group. An example inbound rule is as follows: -``` -Type: Custom TCP -Port range: 1 - 65535 -Source: Custom (use the security group ID) -``` - -## 2. Configure and build FlexFlow -Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance**. You can skip the step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance. - -## 3. Test FlexFlow -Follow the step 6 in [INSTALL.md](INSTALL.md) to set environment variables. - -A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh` and you can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) or [`srun`](https://slurm.schedmd.com/srun.html). 
For example, to run the script with MPI, you need to first enable non-interactive `ssh` logins (refer to [Open MPI doc](https://docs.open-mpi.org/en/v5.0.0rc9/running-apps/ssh.html)) between instances and then run: -``` -mpirun --host :,: -np ./scripts/mnist_mlp_run.sh -``` - -If you encounter some errors like `WARNING: Open MPI accepted a TCP connection from what appears to be a -another Open MPI process but cannot find a corresponding process -entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command. (refer to [stack overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)) \ No newline at end of file diff --git a/PR-README.md b/PR-README.md deleted file mode 100644 index 1da7565903..0000000000 --- a/PR-README.md +++ /dev/null @@ -1,50 +0,0 @@ -# methods need to implement -- [✔] std::unordered_set get_sources(DiGraphView const &);(lib/utils/include/utils/graph/algorithms.h) -- [✔] std::unordered_set get_sources(MultiDiGraphView const &); (lib/utils/include/utils/graph/algorithms.h) -- [✔] ` get_sinks ` lib/utils/include/utils/graph/algorithms.h -- [ ] all `view_as_joined method` (lib/utils/include/utils/graph/views.h) -- [✔] ` MultiDiEdgeQuery query_intersection(MultiDiEdgeQuery const &, MultiDiEdgeQuery const &) `( lib/utils/include/utils/graph/multidigraph.h) -- [✔] ` JoinNodeKey::operator== `(lib/utils/include/utils/graph/views.h) -- [✔] ` JoinNodeKey(Node const &, LRDirection) `(lib/utils/include/utils/graph/views.h) -- [ ] `UndirectedGraphView::operator GraphView&() ` (lib/utils/include/utils/graph/digraph.h) -- [✔] ` unsafe_view_as_flipped(DiGraphView const&)` (lib/utils/include/utils/graph/views.h) -- [] `query_intersection(DirectedEdgeQuery const& ` (lib/utils/include/utils/graph/digraph.h) -- [✔] ` DiGraphView(std::shared_ptr ` (lib/utils/include/utils/graph/digraph.h) -- [✔] ` AdjacencyDiGraph(std::size_t, ContentsType)` (lib/utils/include/utils/graph/adjacency_digraph.h) -- [✔] ` operator maybe_owned_ref() const` (lib/utils/include/utils/graph/undirected.h) -- [ ] ` MultiDiGraphView unsafe(IMultiDiGraphView const &) ` (lib/utils/include/utils/graph/multidigraph.h) -- [✔] `InputMultiDiEdge(std::pair const &, Node const &, std::size_t const &)` (lib/utils/include/utils/graph/open_graph_interfaces.h) -- [✔] ` operator maybe_owned_ref() const ` (lib/utils/include/utils/graph/multidigraph.h) -- [ ] ` tl::optional get_imm_post_dominator(DiGraphView const &, Node const &)` (lib/utils/include/utils/graph/algorithms.h) -- [✔] ` UndirectedGraphView(std::shared_ptr) ` (lib/utils/include/utils/graph/undirected.h) -- [ ] ` std::vector> get_weakly_connected_components(DiGraphView const &) `(lib/utils/include/utils/graph/algorithms.h) -- [ ] ` operator GraphView() const`(lib/utils/include/utils/graph/digraph.h) -- [✔] ` NodeQuery query_intersection(NodeQuery const &, NodeQuery const &) ` (lib/utils/include/utils/graph/node.h) -- [ ] ` operator UndirectedGraphView() const` (lib/utils/include/utils/graph/adjacency_multidigraph.h) -- [✔] ` SplitASTNode(SplitType)` (lib/utils/src/graph/serialparallel_internal.h) -- [✔] ` operator maybe_owned_ref() const`(lib/utils/src/graph/serialparallel_internal.h) -- [✔] ` std::unordered_set get_nodes(GraphView const &)` (lib/utils/include/utils/graph/algorithms.h) -- [✔] ` std::size_t operator()(::FlexFlow::JoinNodeKey const &) const`(lib/utils/include/utils/graph/views.h) -- [ ] `MultiDiGraphView unsafe_view_as_multidigraph(DiGraphView const &)`(lib/utils/include/utils/graph/conversions.h) 
-- [ ] `DiGraphView unsafe_view_as_contracted(DiGraphView const &, std::unordered_map const &)`(lib/utils/include/utils/graph/conversions.h) -- [✔] ` MultiDiInput(Node const &, size_t) ` (lib/utils/include/utils/graph/labelled_graph_interfaces.h) -- [ ] ` DiGraphView unsafe(IDiGraphView const &)` (lib/utils/include/utils/graph/digraph.h) -- [✔] ` virtual ~IUndirectedGraphView() ` (lib/utils/include/utils/graph/undirected.h) -- [✔] ` bool operator==(DiGraphView const &) const` (lib/utils/include/utils/graph/digraph.h) -- [ ] `DiGraphView unsafe_view_as_digraph(MultiDiGraphView const &)`(lib/utils/include/utils/graph/conversions.h) -- [✔] ` virtual ~IDiGraphView() `(lib/utils/include/utils/graph/digraph.h) -- [ ] ` UndirectedGraphView unsafe(IUndirectedGraphView const &)`(lib/utils/include/utils/graph/undirected.h) -- [ ] ` static GraphView unsafe(IGraphView const &)`(lib/utils/include/utils/graph/node.h) -- [✔] `SplitASTNode(SplitType, SplitAST const &, SplitAST const &)`(lib/utils/src/graph/serialparallel_internal.h) -- [✔] ` bool operator!=(DiGraphView const &) const `(lib/utils/include/utils/graph/digraph.h) -- [✔] ` MultiDiOutput(Node const &, size_t) `(lib/utils/include/utils/graph/labelled_graph_interfaces.h) -- [✔] `std::size_t num_nodes(GraphView const &) `(lib/utils/include/utils/graph/algorithms.h) -- [✔] ` std::unordered_set query_nodes(NodeQuery const &) const override`(lib/utils/include/utils/graph/views.h) -- [ ] `MultiDiSubgraphView::query_nodes(NodeQuery const&) const`(lib/utils/include/utils/graph/undirected.h) -- [✔] ` std::unordered_set UndirectedGraphView::query_edges(EdgeQuery const &) const `(lib/utils/include/utils/graph/undirected.h) -- [✔] ` OutputMultiDiEdge::OutputMultiDiEdge(std::pair const &, Node const &, std::size_t const &)`(lib/utils/include/utils/graph/open_graph_interfaces.h) -- [ ✔] ` InputMultiDiEdge::InputMultiDiEdge `lib/utils/include/utils/graph/open_graph_interfaces.h -- [✔] `AdjacencyMultiDiGraph::AdjacencyMultiDiGraph(unsigned long ` lib/utils/include/utils/graph/adjacency_multidigraph.h -- [✔] `MultiDiInput::MultiDiInput(FlexFlow::Node const&, unsigned long)` (lib/utils/include/utils/graph/labelled_graph_interfaces.h) -- [✔] ` MultiDiGraphView::query_edges(MultiDiEdgeQuery const&) const `(lib/utils/include/utils/graph/multidigraph.h) -- [✔] `UndirectedGraphView::query_edges(EdgeQuery const ` (lib/utils/include/utils/graph/undirected.h) \ No newline at end of file diff --git a/README.md b/README.md index 216b32fd52..0d56bc46e0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# flexflow-train +# FlexFlow Train [![clang-format Check](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml/badge.svg?branch=master)](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml) [![per-lib-checks](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml) [![shell-check](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml) @@ -9,7 +9,7 @@ > You are currently viewing [flexflow-train](https://github.com/flexflow/flexflow-train). > For anything inference/serving-related, go to [flexflow-serve](https://github.com/flexflow/flexflow-serve). 
-FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. +FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. use this file - conffile = os.path.join(bmk.file_path, conffile_txt) - - #print("Configuration file: ", conffile) - fileParameters = bmk.read_config_file(conffile)#aux.config_file)#args.config_file) - # Get command-line parameters - args, unkown = bmk.parser.parse_known_args() - print(unkown) - #print ('Params:', fileParameters) - # Check keywords from file against CANDLE common and module definitions - bmk_dict = bmk.additional_definitions - check_file_parameters_exists(args, bmk_dict, fileParameters) - # Consolidate parameter set. Command-line parameters overwrite file configuration - gParameters = args_overwrite_config(args, fileParameters) - # Check that required set of parameters has been defined - bmk.check_required_exists(gParameters) - print ('Params:') - pprint(gParameters) - # Check that no keywords conflict - check_flag_conflicts(gParameters) - - return gParameters - - -def get_default_neon_parser(parser): - """Parse command-line arguments that are default in neon parser (and are common to all frameworks). - Ignore if not present. - - Parameters - ---------- - parser : ArgumentParser object - Parser for neon default command-line options - """ - # Logging Level - parser.add_argument("-v", "--verbose", type=str2bool, - help="increase output verbosity") - parser.add_argument("-l", "--log", dest='logfile', - default=None, - help="log file") - - # Logging utilities - parser.add_argument("-s", "--save_path", dest='save_path', - default=argparse.SUPPRESS, type=str, - help="file path to save model snapshots") - - # General behavior - parser.add_argument("--model_name", dest='model_name', type=str, - default=argparse.SUPPRESS, - help="specify model name to use when building filenames for saving") - parser.add_argument("-d", "--data_type", dest='data_type', - default=argparse.SUPPRESS, - choices=['f16', 'f32', 'f64'], - help="default floating point") - - # Model definition - # Model Architecture - parser.add_argument("--dense", nargs='+', type=int, - default=argparse.SUPPRESS, - help="number of units in fully connected layers in an integer array") - - # Data preprocessing - #parser.add_argument("--shuffle", type=str2bool, - # default=True, - # help="randomly shuffle data set (produces different training and testing partitions each run depending on the seed)") - - # Training configuration - parser.add_argument("-r", "--rng_seed", dest='rng_seed', type=int, - default=argparse.SUPPRESS, - help="random number generator seed") - parser.add_argument("-e", "--epochs", type=int, - default=argparse.SUPPRESS, - help="number of training epochs") - parser.add_argument("-z", "--batch_size", type=int, - default=argparse.SUPPRESS, - help="batch size") - - return parser - - -def get_common_parser(parser): - """Parse command-line arguments. Ignore if not present. 
- - Parameters - ---------- - parser : ArgumentParser object - Parser for command-line options - """ - - # Configuration file - parser.add_argument("--config_file", dest='config_file', default=argparse.SUPPRESS, - help="specify model configuration file") - - # General behavior - parser.add_argument("--train_bool", dest='train_bool', type=str2bool, - default=True, - help="train model") - parser.add_argument("--eval_bool", dest='eval_bool', type=str2bool, - default=argparse.SUPPRESS, - help="evaluate model (use it for inference)") - - parser.add_argument("--timeout", dest='timeout', type=int, action="store", - default=argparse.SUPPRESS, - help="seconds allowed to train model (default: no timeout)") - - - # Logging utilities - parser.add_argument("--home_dir", dest='home_dir', - default=argparse.SUPPRESS, type=str, - help="set home directory") - - parser.add_argument("--train_data", action="store", - default=argparse.SUPPRESS, - help="training data filename") - - parser.add_argument("--test_data", action="store", - default=argparse.SUPPRESS, - help="testing data filename") - - parser.add_argument("--output_dir", dest='output_dir', - default=argparse.SUPPRESS, type=str, - help="output directory") - - parser.add_argument("--data_url", dest='data_url', - default=argparse.SUPPRESS, type=str, - help="set data source url") - - parser.add_argument("--experiment_id", default="EXP000", type=str, help="set the experiment unique identifier") - - parser.add_argument("--run_id", default="RUN000", type=str, help="set the run unique identifier") - - - - # Model definition - # Model Architecture - parser.add_argument("--conv", nargs='+', type=int, - default=argparse.SUPPRESS, - help="integer array describing convolution layers: conv1_filters, conv1_filter_len, conv1_stride, conv2_filters, conv2_filter_len, conv2_stride ...") - parser.add_argument("--locally_connected", type=str2bool, - default=argparse.SUPPRESS, - help="use locally connected layers instead of convolution layers") - parser.add_argument("-a", "--activation", - default=argparse.SUPPRESS, - help="keras activation function to use in inner layers: relu, tanh, sigmoid...") - parser.add_argument("--out_activation", - default=argparse.SUPPRESS, - help="keras activation function to use in out layer: softmax, linear, ...") - - - parser.add_argument("--lstm_size", nargs='+', type=int, - default= argparse.SUPPRESS, - help="integer array describing size of LSTM internal state per layer") - parser.add_argument("--recurrent_dropout", action="store", - default=argparse.SUPPRESS, type=float, - help="ratio of recurrent dropout") - - - # Processing between layers - parser.add_argument("--dropout", type=float, - default=argparse.SUPPRESS, - help="ratio of dropout used in fully connected layers") - parser.add_argument("--pool", type=int, - default=argparse.SUPPRESS, - help="pooling layer length") - parser.add_argument("--batch_normalization", type=str2bool, - default=argparse.SUPPRESS, - help="use batch normalization") - - # Model Evaluation - parser.add_argument("--loss", - default=argparse.SUPPRESS, - help="keras loss function to use: mse, ...") - parser.add_argument("--optimizer", - default=argparse.SUPPRESS, - help="keras optimizer to use: sgd, rmsprop, ...") - - parser.add_argument("--metrics", - default=argparse.SUPPRESS, - help="metrics to evaluate performance: accuracy, ...") - - # Data preprocessing - parser.add_argument("--scaling", - default=argparse.SUPPRESS, - choices=['minabs', 'minmax', 'std', 'none'], - help="type of feature scaling; 'minabs': to 
[-1,1]; 'minmax': to [0,1], 'std': standard unit normalization; 'none': no normalization") - - parser.add_argument("--shuffle", type=str2bool, default=False, - help="randomly shuffle data set (produces different training and testing partitions each run depending on the seed)") - - # Feature selection - parser.add_argument("--feature_subsample", type=int, - default=argparse.SUPPRESS, - help="number of features to randomly sample from each category (cellline expression, drug descriptors, etc), 0 means using all features") - - # Training configuration - parser.add_argument("--learning_rate", - default= argparse.SUPPRESS, type=float, - help="overrides the learning rate for training") - parser.add_argument("--early_stop", type=str2bool, - default= argparse.SUPPRESS, - help="activates keras callback for early stopping of training in function of the monitored variable specified") - parser.add_argument("--momentum", - default= argparse.SUPPRESS, type=float, - help="overrides the momentum to use in the SGD optimizer when training") - - parser.add_argument("--initialization", - default=argparse.SUPPRESS, - choices=['constant', 'uniform', 'normal', 'glorot_uniform', 'lecun_uniform', 'he_normal'], - help="type of weight initialization; 'constant': to 0; 'uniform': to [-0.05,0.05], 'normal': mean 0, stddev 0.05; 'glorot_uniform': [-lim,lim] with lim = sqrt(6/(fan_in+fan_out)); 'lecun_uniform' : [-lim,lim] with lim = sqrt(3/fan_in); 'he_normal' : mean 0, stddev sqrt(2/fan_in)") - parser.add_argument("--val_split", type=float, - default=argparse.SUPPRESS, - help="fraction of data to use in validation") - parser.add_argument("--train_steps", type=int, - default=argparse.SUPPRESS, - help="overrides the number of training batches per epoch if set to nonzero") - parser.add_argument("--val_steps", type=int, - default=argparse.SUPPRESS, - help="overrides the number of validation batches per epoch if set to nonzero") - parser.add_argument("--test_steps", type=int, - default=argparse.SUPPRESS, - help="overrides the number of test batches per epoch if set to nonzero") - parser.add_argument("--train_samples", action="store", - default=argparse.SUPPRESS, type=int, - help="overrides the number of training samples if set to nonzero") - parser.add_argument("--val_samples", action="store", - default=argparse.SUPPRESS, type=int, - help="overrides the number of validation samples if set to nonzero") - - - # Backend configuration - parser.add_argument("--gpus", nargs="*", - default=argparse.SUPPRESS, - #default=[0], - type=int, - help="set IDs of GPUs to use") - - # profiling flags - parser.add_argument("-p", "--profiling", type=str2bool, - default = 'false', - help="Turn profiling on or off") - - # cyclic learning rate - parser.add_argument("--clr_flag", - default=argparse.SUPPRESS, - #default=None, - type=str2bool, - help="CLR flag (boolean)") - parser.add_argument("--clr_mode", - default=argparse.SUPPRESS, - #default=None, - type=str, choices=['trng1', 'trng2', 'exp'], - help="CLR mode (default: trng1)") - parser.add_argument("--clr_base_lr", type=float, - default=argparse.SUPPRESS, - #default=1e-4, - help="Base lr for cycle lr.") - parser.add_argument("--clr_max_lr", type=float, - default=argparse.SUPPRESS, - #default=1e-3, - help="Max lr for cycle lr.") - parser.add_argument("--clr_gamma", type=float, - default=argparse.SUPPRESS, - #default=0.999994, - help="Gamma parameter for learning cycle LR.") - - return parser - - - -def args_overwrite_config(args, config): - """Overwrite configuration parameters with - 
parameters specified via command-line. - - Parameters - ---------- - args : ArgumentParser object - Parameters specified via command-line - config : python dictionary - Parameters read from configuration file - """ - - params = config - - args_dict = vars(args) - - for key in args_dict.keys(): - params[key] = args_dict[key] - - - if 'data_type' not in params: - params['data_type'] = DEFAULT_DATATYPE - else: - if params['data_type'] in set(['f16', 'f32', 'f64']): - params['data_type'] = get_choice(params['datatype']) - - if 'output_dir' not in params: - params['output_dir'] = directory_from_parameters(params) - else: - params['output_dir'] = directory_from_parameters(params, params['output_dir']) - - if 'rng_seed' not in params: - params['rng_seed'] = DEFAULT_SEED - - if 'timeout' not in params: - params['timeout'] = DEFAULT_TIMEOUT - - - return params - - - -def get_choice(name): - """ Maps name string to the right type of argument - """ - mapping = {} - - # dtype - mapping['f16'] = np.float16 - mapping['f32'] = np.float32 - mapping['f64'] = np.float64 - - mapped = mapping.get(name) - if not mapped: - raise Exception('No mapping found for "{}"'.format(name)) - - return mapped - - -def directory_from_parameters(params, commonroot='Output'): - """ Construct output directory path with unique IDs from parameters - - Parameters - ---------- - params : python dictionary - Dictionary of parameters read - commonroot : string - String to specify the common folder to store results. - - """ - - if commonroot in set(['.', './']): # Same directory --> convert to absolute path - outdir = os.path.abspath('.') - else: # Create path specified - outdir = os.path.abspath(os.path.join('.', commonroot)) - if not os.path.exists(outdir): - os.makedirs(outdir) - - outdir = os.path.abspath(os.path.join(outdir, params['experiment_id'])) - if not os.path.exists(outdir): - os.makedirs(outdir) - - outdir = os.path.abspath(os.path.join(outdir, params['run_id'])) - if not os.path.exists(outdir): - os.makedirs(outdir) - - - return outdir - - - -class Benchmark: - """ Class that implements an interface to handle configuration options for the - different CANDLE benchmarks. - It provides access to all the common configuration - options and configuration options particular to each individual benchmark. - It describes what minimum requirements should be specified to instantiate - the corresponding benchmark. - It interacts with the argparser to extract command-line options and arguments - from the benchmark's configuration files. - """ - - def __init__(self, filepath, defmodel, framework, prog=None, desc=None, parser=None): - """ Initialize Benchmark object. - - Parameters - ---------- - filepath : ./ - os.path.dirname where the benchmark is located. Necessary to locate utils and - establish input/ouput paths - defmodel : 'p*b*_default_model.txt' - string corresponding to the default model of the benchmark - framework : 'keras', 'neon', 'mxnet', 'pytorch' - framework used to run the benchmark - prog : 'p*b*_baseline_*' - string for program name (usually associated to benchmark and framework) - desc : ' ' - string describing benchmark (usually a description of the neural network model built) - parser : argparser (default None) - if 'neon' framework a NeonArgparser is passed. Otherwise an argparser is constructed. 
- """ - - if parser is None: - parser = argparse.ArgumentParser(prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=desc, conflict_handler='resolve') - - self.parser = parser - self.file_path = filepath - self.default_model = defmodel - self.framework = framework - - self.required = set([]) - self.additional_definitions = [] - self.set_locals() - - - - def parse_from_common(self): - """Functionality to parse options common - for all benchmarks. - This functionality is based on methods 'get_default_neon_parser' and - 'get_common_parser' which are defined previously(above). If the order changes - or they are moved, the calling has to be updated. - """ - - - # Parse has been split between arguments that are common with the default neon parser - # and all the other options - parser = self.parser - if self.framework is not 'neon': - parser = get_default_neon_parser(parser) - parser = get_common_parser(parser) - - self.parser = parser - - # Set default configuration file - self.conffile = os.path.join(self.file_path, self.default_model) - - - def parse_from_benchmark(self): - """Functionality to parse options specific - specific for each benchmark. - """ - - for d in self.additional_definitions: - if 'type' not in d: - d['type'] = None - if 'default' not in d: - d['default'] = argparse.SUPPRESS - if 'help' not in d: - d['help'] = '' - if 'action' in d: # Actions - if d['action'] == 'list-of-lists': # Non standard. Specific functionallity has been added - d['action'] = ListOfListsAction - self.parser.add_argument('--' + d['name'], dest=d['name'], action=d['action'], type=d['type'], default=d['default'], help=d['help']) - elif (d['action'] == 'store_true') or (d['action'] == 'store_false'): - raise Exception ('The usage of store_true or store_false cannot be undone in the command line. Use type=str2bool instead.') - else: - self.parser.add_argument('--' + d['name'], action=d['action'], default=d['default'], help=d['help']) - else: # Non actions - if 'nargs' in d: # variable parameters - if 'choices' in d: # choices with variable parameters - self.parser.add_argument('--' + d['name'], nargs=d['nargs'], choices=d['choices'], default=d['default'], help=d['help']) - else: # Variable parameters (free, no limited choices) - self.parser.add_argument('--' + d['name'], nargs=d['nargs'], type=d['type'], default=d['default'], help=d['help']) - elif 'choices' in d: # Select from choice (fixed number of parameters) - self.parser.add_argument('--' + d['name'], choices=d['choices'], default=d['default'], help=d['help']) - else: # Non an action, one parameter, no choices - self.parser.add_argument('--' + d['name'], type=d['type'], default=d['default'], help=d['help']) - - - - def format_benchmark_config_arguments(self, dictfileparam): - """ Functionality to format the particular parameters of - the benchmark. 
- - Parameters - ---------- - dictfileparam : python dictionary - parameters read from configuration file - args : python dictionary - parameters read from command-line - Most of the time command-line overwrites configuration file - except when the command-line is using default values and - config file defines those values - - """ - - configOut = dictfileparam.copy() - - for d in self.additional_definitions: - if d['name'] in configOut.keys(): - if 'type' in d: - dtype = d['type'] - else: - dtype = None - - if 'action' in d: - if inspect.isclass(d['action']): - str_read = dictfileparam[d['name']] - configOut[d['name']] = eval_string_as_list_of_lists(str_read, ':', ',', dtype) - elif d['default'] != argparse.SUPPRESS: - # default value on benchmark definition cannot overwrite config file - self.parser.add_argument('--' + d['name'], type=d['type'], default=configOut[d['name']], help=d['help']) - - return configOut - - - - def read_config_file(self, file): - """Functionality to read the configue file - specific for each benchmark. - """ - - config=configparser.ConfigParser() - config.read(file) - section=config.sections() - fileParams={} - - # parse specified arguments (minimal validation: if arguments - # are written several times in the file, just the first time - # will be used) - for sec in section: - for k,v in config.items(sec): - if not k in fileParams: - fileParams[k] = eval(v) - fileParams = self.format_benchmark_config_arguments(fileParams) - #pprint(fileParams) - - return fileParams - - - - def set_locals(self): - """ Functionality to set variables specific for the benchmark - - required: set of required parameters for the benchmark. - - additional_definitions: list of dictionaries describing \ - the additional parameters for the benchmark. - """ - - pass - - - - def check_required_exists(self, gparam): - """Functionality to verify that the required - model parameters have been specified. - """ - - key_set = set(gparam.keys()) - intersect_set = key_set.intersection(self.required) - diff_set = self.required.difference(intersect_set) - - if ( len(diff_set) > 0 ): - raise Exception('ERROR ! Required parameters are not specified. ' \ - 'These required parameters have not been initialized: ' + str(sorted(diff_set)) + \ - '... Exiting') - - - -def keras_default_config(): - """Defines parameters that intervine in different functions using the keras defaults. - This helps to keep consistency in parameters between frameworks. - """ - - kerasDefaults = {} - - # Optimizers - #kerasDefaults['clipnorm']=? # Maximum norm to clip all parameter gradients - #kerasDefaults['clipvalue']=? # Maximum (minimum=-max) value to clip all parameter gradients - kerasDefaults['decay_lr']=0. # Learning rate decay over each update - kerasDefaults['epsilon']=1e-8 # Factor to avoid divide by zero (fuzz factor) - kerasDefaults['rho']=0.9 # Decay parameter in some optmizer updates (rmsprop, adadelta) - kerasDefaults['momentum_sgd']=0. 
# Momentum for parameter update in sgd optimizer - kerasDefaults['nesterov_sgd']=False # Whether to apply Nesterov momentum in sgd optimizer - kerasDefaults['beta_1']=0.9 # Parameter in some optmizer updates (adam, adamax, nadam) - kerasDefaults['beta_2']=0.999 # Parameter in some optmizer updates (adam, adamax, nadam) - kerasDefaults['decay_schedule_lr']=0.004# Parameter for nadam optmizer - - # Initializers - kerasDefaults['minval_uniform']=-0.05 # Lower bound of the range of random values to generate - kerasDefaults['maxval_uniform']=0.05 # Upper bound of the range of random values to generate - kerasDefaults['mean_normal']=0. # Mean of the random values to generate - kerasDefaults['stddev_normal']=0.05 # Standard deviation of the random values to generate - - - return kerasDefaults - diff --git a/examples/python/keras/candle_uno/file_utils.py b/examples/python/keras/candle_uno/file_utils.py deleted file mode 100644 index 46e065d201..0000000000 --- a/examples/python/keras/candle_uno/file_utils.py +++ /dev/null @@ -1,183 +0,0 @@ -from __future__ import absolute_import -from __future__ import print_function - -import tarfile -import os -import sys -import shutil -import hashlib -from six.moves.urllib.request import urlopen -from six.moves.urllib.error import URLError, HTTPError - -import requests -from generic_utils import Progbar - - -# Under Python 2, 'urlretrieve' relies on FancyURLopener from legacy -# urllib module, known to have issues with proxy management -if sys.version_info[0] == 2: - def urlretrieve(url, filename, reporthook=None, data=None): - def chunk_read(response, chunk_size=8192, reporthook=None): - total_size = response.info().get('Content-Length').strip() - total_size = int(total_size) - count = 0 - while 1: - chunk = response.read(chunk_size) - count += 1 - if not chunk: - reporthook(count, total_size, total_size) - break - if reporthook: - reporthook(count, chunk_size, total_size) - yield chunk - - response = urlopen(url, data) - with open(filename, 'wb') as fd: - for chunk in chunk_read(response, reporthook=reporthook): - fd.write(chunk) -else: - from six.moves.urllib.request import urlretrieve - - -def get_file(fname, origin, untar=False, - #md5_hash=None, datadir='../Data/common'): - #md5_hash=None, cache_subdir='common', datadir='../Data/common'): - md5_hash=None, cache_subdir='common', datadir=None): # datadir argument was never actually used so changing it to None - """ Downloads a file from a URL if it not already in the cache. - Passing the MD5 hash will verify the file after download as well - as if it is already present in the cache. - - Parameters - ---------- - fname : string - name of the file - origin : string - original URL of the file - untar : boolean - whether the file should be decompressed - md5_hash : string - MD5 hash of the file for verification - cache_subdir : string - directory being used as the cache - datadir : string - if set, datadir becomes its setting (which could be e.g. 
an absolute path) and cache_subdir no longer matters - - Returns - ---------- - Path to the downloaded file - """ - - if datadir is None: - file_path = os.path.dirname(os.path.realpath(__file__)) - datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) - datadir = os.path.join(datadir_base, cache_subdir) - - if not os.path.exists(datadir): - os.makedirs(datadir) - - #if untar: - # fnamesplit = fname.split('.tar.gz') - # untar_fpath = os.path.join(datadir, fnamesplit[0]) - - if fname.endswith('.tar.gz'): - fnamesplit = fname.split('.tar.gz') - untar_fpath = os.path.join(datadir, fnamesplit[0]) - untar = True - elif fname.endswith('.tgz'): - fnamesplit = fname.split('.tgz') - untar_fpath = os.path.join(datadir, fnamesplit[0]) - untar = True - else: - untar_fpath = None - - fpath = os.path.join(datadir, fname) - - download = False - if os.path.exists(fpath) or (untar_fpath is not None and os.path.exists(untar_fpath)): - # file found; verify integrity if a hash was provided - if md5_hash is not None: - if not validate_file(fpath, md5_hash): - print('A local file was found, but it seems to be ' - 'incomplete or outdated.') - download = True - else: - download = True - - # fix ftp protocol if needed - ''' - if origin.startswith('ftp://'): - new_url = origin.replace('ftp://','http://') - origin = new_url - print('Origin = ', origin) - ''' - - if download: - print('Downloading data from', origin) - global progbar - progbar = None - - def dl_progress(count, block_size, total_size): - global progbar - if progbar is None: - progbar = Progbar(total_size) - else: - progbar.update(count * block_size) - - error_msg = 'URL fetch failure on {}: {} -- {}' - try: - try: - urlretrieve(origin, fpath, dl_progress) - except URLError as e: - raise Exception(error_msg.format(origin, e.errno, e.reason)) - except HTTPError as e: - raise Exception(error_msg.format(origin, e.code, e.msg)) - except (Exception, KeyboardInterrupt) as e: - if os.path.exists(fpath): - os.remove(fpath) - raise - progbar = None - print() - - if untar: - if not os.path.exists(untar_fpath): - print('Untarring file...') - tfile = tarfile.open(fpath, 'r:gz') - try: - tfile.extractall(path=datadir) - except (Exception, KeyboardInterrupt) as e: - if os.path.exists(untar_fpath): - if os.path.isfile(untar_fpath): - os.remove(untar_fpath) - else: - shutil.rmtree(untar_fpath) - raise - tfile.close() - return untar_fpath - print() - - return fpath - - -def validate_file(fpath, md5_hash): - """ Validates a file against a MD5 hash - - Parameters - ---------- - fpath : string - path to the file being validated - md5_hash : string - the MD5 hash being validated against - - Returns - ---------- - boolean - Whether the file is valid - """ - hasher = hashlib.md5() - with open(fpath, 'rb') as f: - buf = f.read() - hasher.update(buf) - if str(hasher.hexdigest()) == str(md5_hash): - return True - else: - return False diff --git a/examples/python/keras/candle_uno/generic_utils.py b/examples/python/keras/candle_uno/generic_utils.py deleted file mode 100644 index 7f72deb301..0000000000 --- a/examples/python/keras/candle_uno/generic_utils.py +++ /dev/null @@ -1,204 +0,0 @@ -from __future__ import absolute_import -from __future__ import print_function -import numpy as np -import time -import sys -import os -import six -import marshal -import types as python_types -import logging - - -def get_from_module(identifier, module_params, module_name, - instantiate=False, kwargs=None): - if isinstance(identifier, six.string_types): - res = 
module_params.get(identifier) - if not res: - raise Exception('Invalid ' + str(module_name) + ': ' + - str(identifier)) - if instantiate and not kwargs: - return res() - elif instantiate and kwargs: - return res(**kwargs) - else: - return res - elif type(identifier) is dict: - name = identifier.pop('name') - res = module_params.get(name) - if res: - return res(**identifier) - else: - raise Exception('Invalid ' + str(module_name) + ': ' + - str(identifier)) - return identifier - - -def make_tuple(*args): - return args - - -def func_dump(func): - """ Serialize user defined function. """ - code = marshal.dumps(func.__code__).decode('raw_unicode_escape') - defaults = func.__defaults__ - if func.__closure__: - closure = tuple(c.cell_contents for c in func.__closure__) - else: - closure = None - return code, defaults, closure - - -def func_load(code, defaults=None, closure=None, globs=None): - """ Deserialize user defined function. """ - if isinstance(code, (tuple, list)): # unpack previous dump - code, defaults, closure = code - code = marshal.loads(code.encode('raw_unicode_escape')) - if closure is not None: - closure = func_reconstruct_closure(closure) - if globs is None: - globs = globals() - return python_types.FunctionType(code, globs, name=code.co_name, argdefs=defaults, closure=closure) - - -def func_reconstruct_closure(values): - """ Deserialization helper that reconstructs a closure. """ - nums = range(len(values)) - src = ["def func(arg):"] - src += [" _%d = arg[%d]" % (n, n) for n in nums] - src += [" return lambda:(%s)" % ','.join(["_%d" % n for n in nums]), ""] - src = '\n'.join(src) - try: - exec(src, globals()) - except: - raise SyntaxError(src) - return func(values).__closure__ - - -class Progbar(object): - def __init__(self, target, width=30, verbose=1, interval=0.01): - """ - Parameters - ------------ - target: int - total number of steps expected - interval: float - minimum visual progress update interval (in seconds) - """ - self.width = width - self.target = target - self.sum_values = {} - self.unique_values = [] - self.start = time.time() - self.last_update = 0 - self.interval = interval - self.total_width = 0 - self.seen_so_far = 0 - self.verbose = verbose - - def update(self, current, values=[], force=False): - """ - Parameters - ------------ - current : int - index of current step - values : list of tuples (name, value_for_last_step). - The progress bar will display averages for these values. - force : boolean - force visual progress update - """ - for k, v in values: - if k not in self.sum_values: - self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] - self.unique_values.append(k) - else: - self.sum_values[k][0] += v * (current - self.seen_so_far) - self.sum_values[k][1] += (current - self.seen_so_far) - self.seen_so_far = current - - now = time.time() - if self.verbose == 1: - if not force and (now - self.last_update) < self.interval: - return - - prev_total_width = self.total_width - sys.stdout.write("\b" * prev_total_width) - sys.stdout.write("\r") - - numdigits = int(np.floor(np.log10(self.target))) + 1 - barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) - bar = barstr % (current, self.target) - prog = float(current) / self.target - prog_width = int(self.width * prog) - if prog_width > 0: - bar += ('=' * (prog_width-1)) - if current < self.target: - bar += '>' - else: - bar += '=' - bar += ('.' 
* (self.width - prog_width)) - bar += ']' - sys.stdout.write(bar) - self.total_width = len(bar) - - if current: - time_per_unit = (now - self.start) / current - else: - time_per_unit = 0 - eta = time_per_unit * (self.target - current) - info = '' - if current < self.target: - info += ' - ETA: %ds' % eta - else: - info += ' - %ds' % (now - self.start) - for k in self.unique_values: - info += ' - %s:' % k - if type(self.sum_values[k]) is list: - avg = self.sum_values[k][0] / max(1, self.sum_values[k][1]) - if abs(avg) > 1e-3: - info += ' %.4f' % avg - else: - info += ' %.4e' % avg - else: - info += ' %s' % self.sum_values[k] - - self.total_width += len(info) - if prev_total_width > self.total_width: - info += ((prev_total_width - self.total_width) * " ") - - sys.stdout.write(info) - sys.stdout.flush() - - if current >= self.target: - sys.stdout.write("\n") - - if self.verbose == 2: - if current >= self.target: - info = '%ds' % (now - self.start) - for k in self.unique_values: - info += ' - %s:' % k - avg = self.sum_values[k][0] / max(1, self.sum_values[k][1]) - if avg > 1e-3: - info += ' %.4f' % avg - else: - info += ' %.4e' % avg - sys.stdout.write(info + "\n") - - self.last_update = now - - def add(self, n, values=[]): - self.update(self.seen_so_far + n, values) - - -def display_table(rows, positions): - - def display_row(objects, positions): - line = '' - for i in range(len(objects)): - line += str(objects[i]) - line = line[:positions[i]] - line += ' ' * (positions[i] - len(line)) - print(line) - - for objects in rows: - display_row(objects, positions) diff --git a/examples/python/keras/candle_uno/uno.py b/examples/python/keras/candle_uno/uno.py deleted file mode 100644 index baad1facf2..0000000000 --- a/examples/python/keras/candle_uno/uno.py +++ /dev/null @@ -1,236 +0,0 @@ -from __future__ import print_function - -import os -import sys -import logging -import argparse -try: - import configparser -except ImportError: - import ConfigParser as configparser - -file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..')) -sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) -sys.path.append(lib_path2) - -from default_utils import Benchmark, str2bool - -logger = logging.getLogger(__name__) - - -class BenchmarkUno(Benchmark): - - def set_locals(self): - """Functionality to set variables specific for the benchmark - - required: set of required parameters for the benchmark. - - additional_definitions: list of dictionaries describing the additional parameters for the - benchmark. 
- """ - - if required is not None: - self.required = set(required) - if additional_definitions is not None: - self.additional_definitions = additional_definitions - - -additional_definitions = [ - # Feature selection - {'name': 'agg_dose', - 'type': str, - 'default': None, - 'choices': ['AUC', 'IC50', 'EC50', 'HS', 'AAC1', 'AUC1', 'DSS1'], - 'help': 'use dose-independent response data with the specified aggregation metric'}, - {'name': 'cell_features', - 'nargs': '+', - 'choices': ['rnaseq', 'none'], - 'help': 'use rnaseq cell line feature set or none at all'}, - {'name': 'drug_features', - 'nargs': '+', - 'choices': ['descriptors', 'fingerprints', 'none', 'mordred'], - 'help': 'use dragon7 descriptors or fingerprint descriptors for drug features or none at all'}, - {'name': 'by_cell', - 'type': str, - 'default': None, - 'help': 'sample ID for building a by-cell model'}, - {'name': 'by_drug', - 'type': str, - 'default': None, - 'help': 'drug ID or name for building a by-drug model'}, - # Data set selection - {'name': 'train_sources', - 'nargs': '+', - 'choices': ['all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], - 'help': 'use one or more sources of drug response data for training'}, - {'name': 'test_sources', - 'nargs': '+', - 'choices': ['train', 'all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], - 'help': 'use one or more sources of drug response data for testing'}, - # Sample selection - {'name': 'cell_types', - 'nargs': '+', - 'help': 'limit training and test data to one or more tissue types'}, - {'name': 'cell_subset_path', - 'type': str, - 'default': '', - 'help': 'path for file with space delimited molecular sample IDs to keep'}, - {'name': 'drug_subset_path', - 'type': str, - 'default': '', - 'help': 'path for file with space delimited drug IDs to keep'}, - {'name': 'drug_median_response_min', - 'type': float, - 'default': -1, - 'help': 'keep drugs whose median response is greater than the threshold'}, - {'name': 'drug_median_response_max', - 'type': float, - 'default': 1, - 'help': 'keep drugs whose median response is less than the threshold'}, - # Training - {'name': 'no_feature_source', - 'type': str2bool, - 'default': False, - 'help': 'do not embed cell or drug feature source as part of input'}, - {'name': 'no_response_source', - 'type': str2bool, - 'default': False, - 'help': 'do not encode response data source as an input feature'}, - {'name': 'dense_feature_layers', - 'nargs': '+', - 'type': int, - 'help': 'number of neurons in intermediate dense layers in the feature encoding submodels'}, - {'name': 'dense_cell_feature_layers', - 'nargs': '+', - 'type': int, - 'default': None, - 'help': 'number of neurons in intermediate dense layers in the cell feature encoding submodels'}, - {'name': 'dense_drug_feature_layers', - 'nargs': '+', - 'type': int, - 'default': None, - 'help': 'number of neurons in intermediate dense layers in the drug feature encoding submodels'}, - {'name': 'use_landmark_genes', - 'type': str2bool, - 'default': False, - 'help': 'use the 978 landmark genes from LINCS (L1000) as expression features'}, - {'name': 'use_filtered_genes', - 'type': str2bool, - 'default': False, - 'help': 'use the variance filtered genes as expression features'}, - {'name': 'feature_subset_path', - 'type': str, - 'default': '', - 'help': 'path for file with space delimited features to keep'}, - {'name': 'cell_feature_subset_path', - 'type': str, - 'default': '', - 'help': 'path for file with space delimited molecular features to 
keep'}, - {'name': 'drug_feature_subset_path', - 'type': str, - 'default': '', - 'help': 'path for file with space delimited drug features to keep'}, - {'name': 'preprocess_rnaseq', - 'choices': ['source_scale', 'combat', 'none'], - 'default': 'none', - 'help': 'preprocessing method for RNAseq data; none for global normalization'}, - {'name': 'residual', - 'type': str2bool, - 'default': False, - 'help': 'add skip connections to the layers'}, - {'name': 'reduce_lr', - 'type': str2bool, - 'default': False, - 'help': 'reduce learning rate on plateau'}, - {'name': 'warmup_lr', - 'type': str2bool, - 'default': False, - 'help': 'gradually increase learning rate on start'}, - {'name': 'base_lr', - 'type': float, - 'default': None, - 'help': 'base learning rate'}, - {'name': 'es', - 'type': str2bool, - 'default': False, - 'help': 'early stopping on val_loss'}, - {'name': 'cp', - 'type': str2bool, - 'default': False, - 'help': 'checkpoint models with best val_loss'}, - {'name': 'tb', - 'type': str2bool, - 'default': False, - 'help': 'use tensorboard'}, - {'name': 'tb_prefix', - 'type': str, - 'default': 'tb', - 'help': 'prefix name for tb log'}, - {'name': 'max_val_loss', - 'type': float, - 'default': argparse.SUPPRESS, - 'help': 'retrain if val_loss is greater than the threshold'}, - {'name': 'partition_by', - 'choices': ['index', 'drug_pair', 'cell'], - 'default': None, - 'help': 'cross-validation partitioning scheme'}, - {'name': 'cv', - 'type': int, - 'default': argparse.SUPPRESS, - 'help': 'cross-validation folds'}, - {'name': 'no_gen', - 'type': str2bool, - 'default': False, - 'help': 'do not use generator for training and validation data'}, - {'name': 'cache', - 'type': str, - 'default': None, - 'help': 'prefix of data cache files to use'}, - {'name': 'single', - 'type': str2bool, - 'default': False, - 'help': 'do not use drug pair representation'}, - {'name': 'export_csv', - 'type': str, - 'default': None, - 'help': 'output csv file name'}, - {'name': 'export_data', - 'type': str, - 'default': None, - 'help': 'output dataframe file name'}, - {'name': 'use_exported_data', - 'type': str, - 'default': None, - 'help': 'exported file name'}, - {'name': 'growth_bins', - 'type': int, - 'default': 0, - 'help': 'number of bins to use when discretizing growth response'}, - {'name': 'initial_weights', - 'type': str, - 'default': None, - 'help': 'file name of initial weights'}, - {'name': 'save_weights', - 'type': str, - 'default': None, - 'help': 'name of file to save weights to'} -] - -required = [ - 'activation', - 'batch_size', - 'dense', - 'dense_feature_layers', - 'dropout', - 'epochs', - 'feature_subsample', - 'learning_rate', - 'loss', - 'optimizer', - 'residual', - 'rng_seed', - 'save_path', - 'scaling', - 'val_split', - 'timeout' -] diff --git a/examples/python/keras/candle_uno/uno_data.py b/examples/python/keras/candle_uno/uno_data.py deleted file mode 100644 index 251d3fcf5a..0000000000 --- a/examples/python/keras/candle_uno/uno_data.py +++ /dev/null @@ -1,1203 +0,0 @@ -from __future__ import print_function - -import collections -import json -import logging -import os -import pickle -import random -import sys - -import numpy as np -import pandas as pd -import flexflow.keras as keras - -from itertools import cycle, islice - -try: - from sklearn.impute import SimpleImputer as Imputer -except ImportError: - from sklearn.preprocessing import Imputer - -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler -from sklearn.model_selection import ShuffleSplit, KFold - -import file_utils - 
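The `additional_definitions` entries removed from uno.py above follow a small dict schema (`name`, `type`, `default`, `choices`, `nargs`, `help`) that CANDLE-style benchmarks feed into argparse when building their command line. A minimal sketch of that translation, assuming a hypothetical `build_parser` helper rather than the benchmark's own plumbing:

```python
import argparse

def build_parser(additional_definitions):
    # Hypothetical helper: turn CANDLE-style definition dicts into CLI
    # flags. The real benchmark wires these through its Benchmark class;
    # this only illustrates the schema of the entries above.
    parser = argparse.ArgumentParser()
    for definition in additional_definitions:
        definition = dict(definition)   # avoid mutating the shared list
        name = definition.pop('name')
        parser.add_argument('--' + name, **definition)
    return parser

defs = [
    {'name': 'agg_dose', 'type': str, 'default': None,
     'choices': ['AUC', 'IC50', 'EC50'],
     'help': 'use dose-independent response data with this metric'},
    {'name': 'cell_features', 'nargs': '+',
     'choices': ['rnaseq', 'none'],
     'help': 'use rnaseq cell line features or none at all'},
]
args = build_parser(defs).parse_args(['--agg_dose', 'AUC'])
print(args.agg_dose)        # -> AUC
print(args.cell_features)   # -> None (flag not given)
```

Entries using `str2bool` or `argparse.SUPPRESS` would go through the same path: `name` is popped off and the remaining keys are forwarded to `add_argument` unchanged.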
-file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) -sys.path.append(lib_path) - -# import candle -import file_utils - -global_cache = {} - -SEED = 2018 - -P1B3_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/' -DATA_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/' - -logger = logging.getLogger(__name__) - - -def set_up_logger(verbose=False): - sh = logging.StreamHandler() - sh.setFormatter(logging.Formatter('')) - sh.setLevel(logging.DEBUG if verbose else logging.INFO) - - logger.setLevel(logging.DEBUG) - logger.addHandler(sh) - - -def set_seed(seed=SEED): - os.environ['PYTHONHASHSEED'] = '0' - np.random.seed(seed) - random.seed(seed) - - -def get_file(url): - fname = os.path.basename(url) - return file_utils.get_file(fname, origin=url, cache_subdir='Pilot1') - - -def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'): - """Impute missing values and scale the data in a pandas dataframe. - - Parameters - ---------- - df : pandas dataframe - dataframe to impute and scale - scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std') - type of scaling to apply - """ - - if dropna: - df = df.dropna(axis=1, how=dropna) - else: - empty_cols = df.columns[df.notnull().sum() == 0] - df[empty_cols] = 0 - - if imputing is None or imputing.lower() == 'none': - mat = df.values - else: - imputer = Imputer(strategy=imputing) - mat = imputer.fit_transform(df) - - if scaling is None or scaling.lower() == 'none': - return pd.DataFrame(mat, columns=df.columns) - - if scaling == 'maxabs': - scaler = MaxAbsScaler() - elif scaling == 'minmax': - scaler = MinMaxScaler() - else: - scaler = StandardScaler() - - mat = scaler.fit_transform(mat) - df = pd.DataFrame(mat, columns=df.columns) - - return df - - -def discretize(df, col, bins=2, cutoffs=None): - y = df[col] - thresholds = cutoffs - if thresholds is None: - percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] - thresholds = [np.percentile(y, x) for x in percentiles] - classes = np.digitize(y, thresholds) - df[col] = classes - return df - - -def save_combined_dose_response(): - df1 = load_single_dose_response(combo_format=True, fraction=False) - df2 = load_combo_dose_response(fraction=False) - df = pd.concat([df1, df2]) - df.to_csv('combined_drug_growth', index=False, sep='\t') - - -def load_combined_dose_response(rename=True): - df1 = load_single_dose_response(combo_format=True) - logger.info('Loaded {} single drug dose response measurements'.format(df1.shape[0])) - - df2 = load_combo_dose_response() - logger.info('Loaded {} drug pair dose response measurements'.format(df2.shape[0])) - - df = pd.concat([df1, df2]) - logger.info('Combined dose response data contains sources: {}'.format(df['SOURCE'].unique())) - - if rename: - df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', - 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', - 'DOSE1': 'Dose1', 'DOSE2': 'Dose2', - 'GROWTH': 'Growth', 'STUDY': 'Study'}) - return df - - -def load_single_dose_response(combo_format=False, fraction=True): - print('DATA_URL: ' + DATA_URL) - # path = get_file(DATA_URL + 'combined_single_drug_growth') - path = get_file(DATA_URL + 'rescaled_combined_single_drug_growth') - - df = global_cache.get(path) - if df is None: - df = pd.read_csv(path, sep='\t', engine='c', - na_values=['na', '-', ''], - # nrows=10, - dtype={'SOURCE': str, 'DRUG_ID': str, - 'CELLNAME': str, 'CONCUNIT': str, - 'LOG_CONCENTRATION': 
np.float32, - 'EXPID': str, 'GROWTH': np.float32}) - global_cache[path] = df - - df['DOSE'] = -df['LOG_CONCENTRATION'] - - df = df.rename(columns={'CELLNAME': 'CELL', 'DRUG_ID': 'DRUG', 'EXPID': 'STUDY'}) - df = df[['SOURCE', 'CELL', 'DRUG', 'DOSE', 'GROWTH', 'STUDY']] - - if fraction: - df['GROWTH'] /= 100 - - if combo_format: - df = df.rename(columns={'DRUG': 'DRUG1', 'DOSE': 'DOSE1'}) - df['DRUG2'] = np.nan - df['DOSE2'] = np.nan - df['DRUG2'] = df['DRUG2'].astype(object) - df['DOSE2'] = df['DOSE2'].astype(np.float32) - df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] - - return df - - -def load_combo_dose_response(fraction=True): - path = get_file(DATA_URL + 'ComboDrugGrowth_Nov2017.csv') - df = global_cache.get(path) - if df is None: - df = pd.read_csv(path, sep=',', engine='c', - na_values=['na', '-', ''], - usecols=['CELLNAME', 'NSC1', 'CONC1', 'NSC2', 'CONC2', - 'PERCENTGROWTH', 'VALID', 'SCREENER', 'STUDY'], - # nrows=10000, - dtype={'CELLNAME': str, 'NSC1': str, 'NSC2': str, - 'CONC1': np.float32, 'CONC2': np.float32, - 'PERCENTGROWTH': np.float32, 'VALID': str, - 'SCREENER': str, 'STUDY': str}, - error_bad_lines=False, warn_bad_lines=True) - global_cache[path] = df - - df = df[df['VALID'] == 'Y'] - - df['SOURCE'] = 'ALMANAC.' + df['SCREENER'] - - cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') - df_cellmap = pd.read_csv(cellmap_path, sep='\t') - df_cellmap.set_index('Name', inplace=True) - cellmap = df_cellmap[['NCI60.ID']].to_dict()['NCI60.ID'] - - df['CELL'] = df['CELLNAME'].map(lambda x: cellmap[x]) - - df['DOSE1'] = -np.log10(df['CONC1']) - df['DOSE2'] = -np.log10(df['CONC2']) - - df['DRUG1'] = 'NSC.' + df['NSC1'] - df['DRUG2'] = 'NSC.' + df['NSC2'] - - if fraction: - df['GROWTH'] = df['PERCENTGROWTH'] / 100 - else: - df['GROWTH'] = df['PERCENTGROWTH'] - - df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] - - return df - - -def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3, combo_format=False, rename=True): - path = get_file(DATA_URL + 'combined_single_response_agg') - - df = global_cache.get(path) - if df is None: - df = pd.read_csv(path, engine='c', sep='\t', - dtype={'SOURCE': str, 'CELL': str, 'DRUG': str, 'STUDY': str, - 'AUC': np.float32, 'IC50': np.float32, - 'EC50': np.float32, 'EC50se': np.float32, - 'R2fit': np.float32, 'Einf': np.float32, - 'HS': np.float32, 'AAC1': np.float32, - 'AUC1': np.float32, 'DSS1': np.float32}) - global_cache[path] = df - - total = len(df) - - df = df[(df['R2fit'] >= min_r2_fit) & (df['EC50se'] <= max_ec50_se)] - df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']] - df = df[~df[target].isnull()] - - logger.info('Loaded %d dose-independent response samples (filtered by EC50se <= %f & R2fit >= %f from a total of %d).', len(df), max_ec50_se, min_r2_fit, total) - - if combo_format: - df = df.rename(columns={'DRUG': 'DRUG1'}) - df['DRUG2'] = np.nan - df['DRUG2'] = df['DRUG2'].astype(object) - df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']] - if rename: - df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', - 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', 'STUDY': 'Study'}) - else: - if rename: - df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', - 'DRUG': 'Drug', 'STUDY': 'Study'}) - - return df - - -def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True): - df_info = load_drug_info() - df_info['Drug'] = df_info['PUBCHEM'] - - df_desc = 
load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) - df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) - - df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) - df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) - - df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) - df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) - - df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) - df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) - df2 = df_desc.drop('Drug', 1) - df2 = impute_and_scale(df2, scaling=scaling, imputing=imputing, dropna=dropna) - if add_prefix: - df2 = df2.add_prefix('dragon7.') - df_desc = pd.concat([df1, df2], axis=1) - - df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) - df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) - df2 = df_fp.drop('Drug', 1) - df2 = impute_and_scale(df2, scaling=None, imputing=imputing, dropna=dropna) - if add_prefix: - df2 = df2.add_prefix('dragon7.') - df_fp = pd.concat([df1, df2], axis=1) - - logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) - logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) - - return df_desc, df_fp - - -def load_mordred_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): - path = get_file(DATA_URL + 'extended_combined_mordred_descriptors') - - df = pd.read_csv(path, engine='c', sep='\t', na_values=['na', '-', '']) - df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce') - df.iloc[:, 1:] = df.iloc[:, 1:].astype(np.float32) - - df1 = pd.DataFrame(df.loc[:, 'DRUG']) - df1.rename(columns={'DRUG': 'Drug'}, inplace=True) - - df2 = df.drop('DRUG', 1) - if add_prefix: - df2 = df2.add_prefix('mordred.') - - df2 = impute_and_scale(df2, scaling, imputing) - - df_desc = pd.concat([df1, df2], axis=1) - - df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) - df2 = df_desc.drop('Drug', 1) - if add_prefix: - df2 = df2.add_prefix('mordred.') - if feature_subset: - df2 = df2[[x for x in df2.columns if x in feature_subset]] - df2 = impute_and_scale(df2, scaling=scaling, imputing=imputing, dropna=dropna) - df_desc = pd.concat([df1, df2], axis=1) - - logger.info('Loaded Mordred drug descriptors: %s', df_desc.shape) - - return df_desc - - -def load_drug_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): - df_info = load_drug_info() - df_info['Drug'] = df_info['PUBCHEM'] - - df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) - df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) - - df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) - - df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) - df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) - df2 = df_desc.drop('Drug', 1) - if add_prefix: - df2 = df2.add_prefix('dragon7.') - if feature_subset: - df2 = df2[[x for x in df2.columns if x in feature_subset]] - df2 = impute_and_scale(df2, scaling=scaling, imputing=imputing, dropna=dropna) - df_desc = pd.concat([df1, df2], axis=1) - - logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) - - return df_desc - - -def load_drug_fingerprints(ncols=None, scaling='std', 
imputing='mean', dropna=None, add_prefix=True, feature_subset=None): - df_info = load_drug_info() - df_info['Drug'] = df_info['PUBCHEM'] - - df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) - df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) - - df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) - - df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) - df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) - df2 = df_fp.drop('Drug', 1) - if add_prefix: - df2 = df2.add_prefix('dragon7.') - if feature_subset: - df2 = df2[[x for x in df2.columns if x in feature_subset]] - df2 = impute_and_scale(df2, scaling=None, imputing=imputing, dropna=dropna) - df_fp = pd.concat([df1, df2], axis=1) - - logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) - - return df_fp - - -def load_drug_info(): - path = get_file(DATA_URL + 'drug_info') - df = pd.read_csv(path, sep='\t', dtype=object) - df['PUBCHEM'] = 'PubChem.CID.' + df['PUBCHEM'] - return df - - -def lookup(df, query, ret, keys, match='match'): - mask = pd.Series(False, index=range(df.shape[0])) - for key in keys: - if match == 'contains': - mask |= df[key].str.contains(query.upper(), case=False) - else: - mask |= (df[key].str.upper() == query.upper()) - return list(set(df[mask][ret].values.flatten().tolist())) - - -def load_cell_metadata(): - path = get_file(DATA_URL + 'cl_metadata') - df = pd.read_csv(path, sep='\t') - return df - - -def cell_name_to_ids(name, source=None): - path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') - df1 = pd.read_csv(path, sep='\t') - hits1 = lookup(df1, name, 'NCI60.ID', ['NCI60.ID', 'CELLNAME', 'Name'], match='contains') - path = get_file(DATA_URL + 'cl_mapping') - df2 = pd.read_csv(path, sep='\t', header=None) - hits2 = lookup(df2, name, [0, 1], [0, 1], match='contains') - hits = hits1 + hits2 - if source: - hits = [x for x in hits if x.startswith(source.upper() + '.')] - return hits - - -def drug_name_to_ids(name, source=None): - df1 = load_drug_info() - path = get_file(DATA_URL + 'NCI_IOA_AOA_drugs') - df2 = pd.read_csv(path, sep='\t', dtype=str) - df2['NSC'] = 'NSC.' 
+ df2['NSC'] - hits1 = lookup(df1, name, 'ID', ['ID', 'NAME', 'CLEAN_NAME', 'PUBCHEM']) - hits2 = lookup(df2, name, 'NSC', ['NSC', 'Generic Name', 'Preffered Name']) - hits = hits1 + hits2 - if source: - hits = [x for x in hits if x.startswith(source.upper() + '.')] - return hits - - -def load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=None, usecols=None, - scaling=None, imputing=None, add_prefix=False): - path = get_file(DATA_URL + '{}_dragon7_descriptors.tsv'.format(drug_set)) - - df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) - total = df_cols.shape[1] - 1 - if usecols is not None: - usecols = [x for x in usecols if x in df_cols.columns] - if usecols[0] != 'NAME': - usecols = ['NAME'] + usecols - df_cols = df_cols.loc[:, usecols] - elif ncols and ncols < total: - usecols = np.random.choice(total, size=ncols, replace=False) - usecols = np.append([0], np.add(sorted(usecols), 1)) - df_cols = df_cols.iloc[:, usecols] - - dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) - df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict, - na_values=['na', '-', '']) - - df1 = pd.DataFrame(df.loc[:, 'NAME']) - df1.rename(columns={'NAME': 'Drug'}, inplace=True) - - df2 = df.drop('NAME', 1) - if add_prefix: - df2 = df2.add_prefix('dragon7.') - - df2 = impute_and_scale(df2, scaling, imputing, dropna=None) - - df = pd.concat([df1, df2], axis=1) - return df - - -def load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=None, usecols=None, - scaling=None, imputing=None, add_prefix=False): - fps = ['PFP', 'ECFP'] - usecols_all = usecols - df_merged = None - for fp in fps: - path = get_file(DATA_URL + '{}_dragon7_{}.tsv'.format(drug_set, fp)) - df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0, skiprows=1, header=None) - total = df_cols.shape[1] - 1 - if usecols_all is not None: - usecols = [x.replace(fp + '.', '') for x in usecols_all] - usecols = [int(x) for x in usecols if x.isdigit()] - usecols = [x for x in usecols if x in df_cols.columns] - if usecols[0] != 0: - usecols = [0] + usecols - df_cols = df_cols.loc[:, usecols] - elif ncols and ncols < total: - usecols = np.random.choice(total, size=ncols, replace=False) - usecols = np.append([0], np.add(sorted(usecols), 1)) - df_cols = df_cols.iloc[:, usecols] - - dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) - df = pd.read_csv(path, engine='c', sep='\t', skiprows=1, header=None, - usecols=usecols, dtype=dtype_dict) - df.columns = ['{}.{}'.format(fp, x) for x in df.columns] - - col1 = '{}.0'.format(fp) - df1 = pd.DataFrame(df.loc[:, col1]) - df1.rename(columns={col1: 'Drug'}, inplace=True) - - df2 = df.drop(col1, 1) - if add_prefix: - df2 = df2.add_prefix('dragon7.') - - df2 = impute_and_scale(df2, scaling, imputing, dropna=None) - - df = pd.concat([df1, df2], axis=1) - - df_merged = df if df_merged is None else df_merged.merge(df) - - return df_merged - - -# def load_drug_smiles(): -# path = get_file(DATA_URL + 'ChemStructures_Consistent.smiles') - -# df = global_cache.get(path) -# if df is None: -# df = pd.read_csv(path, sep='\t', engine='c', dtype={'nsc_id':object}) -# df = df.rename(columns={'nsc_id': 'NSC'}) -# global_cache[path] = df - -# return df - -def encode_sources(sources): - df = pd.get_dummies(sources, prefix='source', prefix_sep='.') - df['Source'] = sources - source_l1 = df['Source'].str.extract('^(\S+)\.', expand=False) - df1 = pd.get_dummies(source_l1, prefix='source.L1', prefix_sep='.') - df = pd.concat([df1, df], axis=1) - df = 
df.set_index('Source').reset_index() - return df - - -def load_cell_rnaseq(ncols=None, scaling='std', imputing='mean', add_prefix=True, - use_landmark_genes=False, use_filtered_genes=False, - feature_subset=None, preprocess_rnaseq=None, - embed_feature_source=False, sample_set=None, index_by_sample=False): - - if use_landmark_genes: - filename = 'combined_rnaseq_data_lincs1000' - elif use_filtered_genes: - filename = 'combined_rnaseq_data_filtered' - else: - filename = 'combined_rnaseq_data' - - if preprocess_rnaseq and preprocess_rnaseq != 'none': - scaling = None - filename += ('_' + preprocess_rnaseq) # 'source_scale' or 'combat' - - path = get_file(DATA_URL + filename) - df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) - total = df_cols.shape[1] - 1 # remove Sample column - if 'Cancer_type_id' in df_cols.columns: - total -= 1 - usecols = None - if ncols and ncols < total: - usecols = np.random.choice(total, size=ncols, replace=False) - usecols = np.append([0], np.add(sorted(usecols), 2)) - df_cols = df_cols.iloc[:, usecols] - if feature_subset: - with_prefix = lambda x: 'rnaseq.' + x if add_prefix else x - usecols = [0] + [i for i, c in enumerate(df_cols.columns) if with_prefix(c) in feature_subset] - df_cols = df_cols.iloc[:, usecols] - - dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) - df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict) - if 'Cancer_type_id' in df.columns: - df.drop('Cancer_type_id', axis=1, inplace=True) - - prefixes = df['Sample'].str.extract('^([^.]*)', expand=False).rename('Source') - sources = prefixes.drop_duplicates().reset_index(drop=True) - df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.') - df_source = pd.concat([sources, df_source], axis=1) - - df1 = df['Sample'] - if embed_feature_source: - df_sample_source = pd.concat([df1, prefixes], axis=1) - df1 = df_sample_source.merge(df_source, on='Source', how='left').drop('Source', axis=1) - logger.info('Embedding RNAseq data source into features: %d additional columns', df1.shape[1] - 1) - - df2 = df.drop('Sample', 1) - if add_prefix: - df2 = df2.add_prefix('rnaseq.') - - df2 = impute_and_scale(df2, scaling, imputing) - - df = pd.concat([df1, df2], axis=1) - - # scaling needs to be done before subsampling - if sample_set: - chosen = df['Sample'].str.startswith(sample_set) - df = df[chosen].reset_index(drop=True) - - if index_by_sample: - df = df.set_index('Sample') - - logger.info('Loaded combined RNAseq data: %s', df.shape) - - return df - - -def read_set_from_file(path): - if path: - with open(path, 'r') as f: - text = f.read().strip() - subset = text.split() - else: - subset = None - return subset - - -def select_drugs_with_response_range(df_response, lower=0, upper=0, span=0, lower_median=None, upper_median=None): - df = df_response.groupby(['Drug1', 'Sample'])['Growth'].agg(['min', 'max', 'median']) - df['span'] = df['max'].clip(lower=-1, upper=1) - df['min'].clip(lower=-1, upper=1) - df = df.groupby('Drug1').mean().reset_index().rename(columns={'Drug1': 'Drug'}) - mask = (df['min'] <= lower) & (df['max'] >= upper) & (df['span'] >= span) - if lower_median: - mask &= (df['median'] >= lower_median) - if upper_median: - mask &= (df['median'] <= upper_median) - df_sub = df[mask] - return df_sub - - -def summarize_response_data(df, target=None): - target = target or 'Growth' - df_sum = df.groupby('Source').agg({target: 'count', 'Sample': 'nunique', - 'Drug1': 'nunique', 'Drug2': 'nunique'}) - if 'Dose1' in df_sum: - 
df_sum['MedianDose'] = df.groupby('Source').agg({'Dose1': 'median'}) - return df_sum - - -def assign_partition_groups(df, partition_by='drug_pair'): - if partition_by == 'cell': - group = df['Sample'] - elif partition_by == 'drug_pair': - df_info = load_drug_info() - id_dict = df_info[['ID', 'PUBCHEM']].drop_duplicates(['ID']).set_index('ID').iloc[:, 0] - group = df['Drug1'].copy() - group[(df['Drug2'].notnull()) & (df['Drug1'] <= df['Drug2'])] = df['Drug1'] + ',' + df['Drug2'] - group[(df['Drug2'].notnull()) & (df['Drug1'] > df['Drug2'])] = df['Drug2'] + ',' + df['Drug1'] - group2 = group.map(id_dict) - mapped = group2.notnull() - group[mapped] = group2[mapped] - elif partition_by == 'index': - group = df.reset_index()['index'] - logger.info('Grouped response data by %s: %d groups', partition_by, group.nunique()) - return group - - -def dict_compare(d1, d2, ignore=[], expand=False): - d1_keys = set(d1.keys()) - set(ignore) - d2_keys = set(d2.keys()) - set(ignore) - intersect_keys = d1_keys.intersection(d2_keys) - added = d1_keys - d2_keys - removed = d2_keys - d1_keys - modified = set({x: (d1[x], d2[x]) for x in intersect_keys if d1[x] != d2[x]}) - common = set(x for x in intersect_keys if d1[x] == d2[x]) - equal = not (added or removed or modified) - if expand: - return equal, added, removed, modified, common - else: - return equal, added | removed | modified - - -def values_or_dataframe(df, contiguous=False, dataframe=False): - if dataframe: - return df - mat = df.values - if contiguous: - mat = np.ascontiguousarray(mat) - return mat - - -class CombinedDataLoader(object): - def __init__(self, seed=SEED): - self.seed = seed - - def load_from_cache(self, cache, params): - """ NOTE: How does this function return an error? (False?) -Wozniak """ - param_fname = '{}.params.json'.format(cache) - if not os.path.isfile(param_fname): - logger.warning('Cache parameter file does not exist: %s', param_fname) - return False - with open(param_fname) as param_file: - try: - cached_params = json.load(param_file) - except json.JSONDecodeError as e: - logger.warning('Could not decode parameter file %s: %s', param_fname, e) - return False - ignore_keys = ['cache', 'partition_by', 'single', 'use_exported_data'] - equal, diffs = dict_compare(params, cached_params, ignore_keys) - if not equal: - logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) - logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) - raise ValueError('Could not load from a cache with incompatible keys:', diffs) - else: - fname = '{}.pkl'.format(cache) - if not os.path.isfile(fname): - logger.warning('Cache file does not exist: %s', fname) - return False - with open(fname, 'rb') as f: - obj = pickle.load(f) - self.__dict__.update(obj.__dict__) - logger.info('Loaded data from cache: %s', fname) - return True - # NOTE: This is unreachable -Wozniak - return False - - def save_to_cache(self, cache, params): - for k in ['self', 'cache', 'single']: - if k in params: - del params[k] - dirname = os.path.dirname(cache) - if not os.path.exists(dirname): - logger.debug('Creating directory for cache: %s', dirname) - os.mkdir(dirname) - param_fname = '{}.params.json'.format(cache) - with open(param_fname, 'w') as param_file: - json.dump(params, param_file, sort_keys=True) - fname = '{}.pkl'.format(cache) - with open(fname, 'wb') as f: - pickle.dump(self, f, pickle.HIGHEST_PROTOCOL) - logger.info('Saved data to cache: %s', fname) - - def partition_data(self, partition_by=None, 
cv_folds=1, train_split=0.7, val_split=0.2, - cell_types=None, by_cell=None, by_drug=None, - cell_subset_path=None, drug_subset_path=None): - - seed = self.seed - train_sep_sources = self.train_sep_sources - test_sep_sources = self.test_sep_sources - df_response = self.df_response - - if not partition_by: - if by_drug and by_cell: - partition_by = 'index' - elif by_drug: - partition_by = 'cell' - else: - partition_by = 'drug_pair' - - if partition_by != self.partition_by: - df_response = df_response.assign(Group=assign_partition_groups(df_response, partition_by)) - - mask = df_response['Source'].isin(train_sep_sources) - test_mask = df_response['Source'].isin(test_sep_sources) - - if by_drug: - drug_ids = drug_name_to_ids(by_drug) - logger.info('Mapped drug IDs for %s: %s', by_drug, drug_ids) - mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) - test_mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) - - if by_cell: - cell_ids = cell_name_to_ids(by_cell) - logger.info('Mapped sample IDs for %s: %s', by_cell, cell_ids) - mask &= (df_response['Sample'].isin(cell_ids)) - test_mask &= (df_response['Sample'].isin(cell_ids)) - - if cell_subset_path: - cell_subset = read_set_from_file(cell_subset_path) - mask &= (df_response['Sample'].isin(cell_subset)) - test_mask &= (df_response['Sample'].isin(cell_subset)) - - if drug_subset_path: - drug_subset = read_set_from_file(drug_subset_path) - mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) - test_mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) - - if cell_types: - df_type = load_cell_metadata() - cell_ids = set() - for cell_type in cell_types: - cells = df_type[~df_type['TUMOR_TYPE'].isnull() & df_type['TUMOR_TYPE'].str.contains(cell_type, case=False)] - cell_ids |= set(cells['ANL_ID'].tolist()) - logger.info('Mapped sample tissue types for %s: %s', cell_type, set(cells['TUMOR_TYPE'].tolist())) - mask &= (df_response['Sample'].isin(cell_ids)) - test_mask &= (df_response['Sample'].isin(cell_ids)) - - df_group = df_response[mask]['Group'].drop_duplicates().reset_index(drop=True) - - if cv_folds > 1: - selector = KFold(n_splits=cv_folds, shuffle=True, random_state=seed) - else: - selector = ShuffleSplit(n_splits=1, train_size=train_split, test_size=val_split, random_state=seed) - - splits = selector.split(df_group) - - train_indexes = [] - val_indexes = [] - test_indexes = [] - - for index, (train_group_index, val_group_index) in enumerate(splits): - train_groups = set(df_group.values[train_group_index]) - val_groups = set(df_group.values[val_group_index]) - train_index = df_response.index[df_response['Group'].isin(train_groups) & mask] - val_index = df_response.index[df_response['Group'].isin(val_groups) & mask] - test_index = df_response.index[~df_response['Group'].isin(train_groups) & ~df_response['Group'].isin(val_groups) & test_mask] - - train_indexes.append(train_index) - val_indexes.append(val_index) - test_indexes.append(test_index) - if logger.isEnabledFor(logging.DEBUG): - logger.debug('CV fold %d: train data = %s, val data = %s, test data = %s', index, train_index.shape[0], val_index.shape[0], test_index.shape[0]) - logger.debug(' train groups (%d): %s', df_response.loc[train_index]['Group'].nunique(), df_response.loc[train_index]['Group'].unique()) - logger.debug(' val groups ({%d}): %s', 
df_response.loc[val_index]['Group'].nunique(), df_response.loc[val_index]['Group'].unique()) - logger.debug(' test groups ({%d}): %s', df_response.loc[test_index]['Group'].nunique(), df_response.loc[test_index]['Group'].unique()) - - self.partition_by = partition_by - self.cv_folds = cv_folds - self.train_indexes = train_indexes - self.val_indexes = val_indexes - self.test_indexes = test_indexes - - def build_feature_list(self, single=False): - input_features = collections.OrderedDict() - feature_shapes = collections.OrderedDict() - - if not self.agg_dose: - doses = ['dose1', 'dose2'] if not single else ['dose1'] - for dose in doses: - input_features[dose] = 'dose' - feature_shapes['dose'] = (1,) - - if self.encode_response_source: - input_features['response.source'] = 'response.source' - feature_shapes['response.source'] = (self.df_source.shape[1] - 1,) - - for fea in self.cell_features: - feature_type = 'cell.' + fea - feature_name = 'cell.' + fea - df_cell = getattr(self, self.cell_df_dict[fea]) - input_features[feature_name] = feature_type - feature_shapes[feature_type] = (df_cell.shape[1] - 1,) - - drugs = ['drug1', 'drug2'] if not single else ['drug1'] - for drug in drugs: - for fea in self.drug_features: - feature_type = 'drug.' + fea - feature_name = drug + '.' + fea - df_drug = getattr(self, self.drug_df_dict[fea]) - input_features[feature_name] = feature_type - feature_shapes[feature_type] = (df_drug.shape[1] - 1,) - - input_dim = sum([np.prod(feature_shapes[x]) for x in input_features.values()]) - - self.input_features = input_features - self.feature_shapes = feature_shapes - self.input_dim = input_dim - - logger.info('Input features shapes:') - for k, v in self.input_features.items(): - logger.info(' {}: {}'.format(k, self.feature_shapes[v])) - logger.info('Total input dimensions: {}'.format(self.input_dim)) - - def load(self, cache=None, ncols=None, scaling='std', dropna=None, - agg_dose=None, embed_feature_source=True, encode_response_source=True, - cell_features=['rnaseq'], drug_features=['descriptors', 'fingerprints'], - cell_feature_subset_path=None, drug_feature_subset_path=None, - drug_lower_response=1, drug_upper_response=-1, drug_response_span=0, - drug_median_response_min=-1, drug_median_response_max=1, - use_landmark_genes=False, use_filtered_genes=False, use_exported_data=None, - preprocess_rnaseq=None, single=False, - # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'], - train_sources=['GDSC', 'CTRP', 'ALMANAC'], - # val_sources='train', - # test_sources=['CCLE', 'gCSI'], - test_sources=['train'], - partition_by='drug_pair'): - - params = locals().copy() - del params['self'] - - if not cell_features or 'none' in [x.lower() for x in cell_features]: - cell_features = [] - - if not drug_features or 'none' in [x.lower() for x in drug_features]: - drug_features = [] - - if cache and self.load_from_cache(cache, params): - self.build_feature_list(single=single) - return - - # rebuild cache equivalent from the exported dataset - if use_exported_data is not None: - with pd.HDFStore(use_exported_data, 'r') as store: - if '/model' in store.keys(): - self.input_features = store.get_storer('model').attrs.input_features - self.feature_shapes = store.get_storer('model').attrs.feature_shapes - self.input_dim = sum([np.prod(self.feature_shapes[x]) for x in self.input_features.values()]) - self.test_sep_sources = [] - return - else: - logger.warning('\nExported dataset does not have model info. 
Please rebuild the dataset.\n') - raise ValueError('Could not load model info from the dataset:', use_exported_data) - - logger.info('Loading data from scratch ...') - - if agg_dose: - df_response = load_aggregated_single_response(target=agg_dose, combo_format=True) - else: - df_response = load_combined_dose_response() - - if logger.isEnabledFor(logging.INFO): - logger.info('Summary of combined dose response by source:') - logger.info(summarize_response_data(df_response, target=agg_dose)) - - all_sources = df_response['Source'].unique() - df_source = encode_sources(all_sources) - - if 'all' in train_sources: - train_sources = all_sources - if 'all' in test_sources: - test_sources = all_sources - elif 'train' in test_sources: - test_sources = train_sources - - train_sep_sources = [x for x in all_sources for y in train_sources if x.startswith(y)] - test_sep_sources = [x for x in all_sources for y in test_sources if x.startswith(y)] - - ids1 = df_response[['Drug1']].drop_duplicates().rename(columns={'Drug1': 'Drug'}) - ids2 = df_response[['Drug2']].drop_duplicates().rename(columns={'Drug2': 'Drug'}) - df_drugs_with_response = pd.concat([ids1, ids2]).drop_duplicates().dropna().reset_index(drop=True) - df_cells_with_response = df_response[['Sample']].drop_duplicates().reset_index(drop=True) - logger.info('Combined raw dose response data has %d unique samples and %d unique drugs', df_cells_with_response.shape[0], df_drugs_with_response.shape[0]) - - if agg_dose: - df_selected_drugs = None - else: - logger.info('Limiting drugs to those with response min <= %g, max >= %g, span >= %g, median_min <= %g, median_max >= %g ...', drug_lower_response, drug_upper_response, drug_response_span, drug_median_response_min, drug_median_response_max) - df_selected_drugs = select_drugs_with_response_range(df_response, span=drug_response_span, lower=drug_lower_response, upper=drug_upper_response, lower_median=drug_median_response_min, upper_median=drug_median_response_max) - logger.info('Selected %d drugs from %d', df_selected_drugs.shape[0], df_response['Drug1'].nunique()) - - cell_feature_subset = read_set_from_file(cell_feature_subset_path) - drug_feature_subset = read_set_from_file(drug_feature_subset_path) - - for fea in cell_features: - fea = fea.lower() - if fea == 'rnaseq' or fea == 'expression': - df_cell_rnaseq = load_cell_rnaseq(ncols=ncols, scaling=scaling, use_landmark_genes=use_landmark_genes, use_filtered_genes=use_filtered_genes, feature_subset=cell_feature_subset, preprocess_rnaseq=preprocess_rnaseq, embed_feature_source=embed_feature_source) - - for fea in drug_features: - fea = fea.lower() - if fea == 'descriptors': - df_drug_desc = load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) - elif fea == 'fingerprints': - df_drug_fp = load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) - elif fea == 'mordred': - df_drug_mordred = load_mordred_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) - - # df_drug_desc, df_drug_fp = load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna) - - cell_df_dict = {'rnaseq': 'df_cell_rnaseq'} - - drug_df_dict = {'descriptors': 'df_drug_desc', - 'fingerprints': 'df_drug_fp', - 'mordred': 'df_drug_mordred'} - - # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates() - # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates() - - logger.info('Filtering drug response data...') - - 
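The `df_cell_ids` / `df_drug_ids` reduction that follows intersects the response table's IDs with each loaded feature frame, so only samples and drugs that have both features and responses survive. A toy, self-contained sketch of the same `merge`/`drop_duplicates` pattern (fabricated frames, not the benchmark's data):

```python
import pandas as pd

# Fabricated stand-ins for the response IDs and a feature table's key column.
responses = pd.DataFrame({'Sample': ['CCLE.A', 'CCLE.B', 'GDSC.C']})
rnaseq = pd.DataFrame({'Sample': ['CCLE.A', 'GDSC.C', 'GDSC.D']})

# An inner merge on the shared key acts as a set intersection;
# drop_duplicates guards against feature tables that repeat an ID.
df_cell_ids = responses.merge(rnaseq[['Sample']]).drop_duplicates()
print(df_cell_ids['Sample'].tolist())  # ['CCLE.A', 'GDSC.C']
```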
df_cell_ids = df_cells_with_response - for fea in cell_features: - df_cell = locals()[cell_df_dict[fea]] - df_cell_ids = df_cell_ids.merge(df_cell[['Sample']]).drop_duplicates() - logger.info(' %d molecular samples with feature and response data', df_cell_ids.shape[0]) - - df_drug_ids = df_drugs_with_response - for fea in drug_features: - df_drug = locals()[drug_df_dict[fea]] - df_drug_ids = df_drug_ids.merge(df_drug[['Drug']]).drop_duplicates() - - if df_selected_drugs is not None: - df_drug_ids = df_drug_ids.merge(df_selected_drugs).drop_duplicates() - logger.info(' %d selected drugs with feature and response data', df_drug_ids.shape[0]) - - df_response = df_response[df_response['Sample'].isin(df_cell_ids['Sample']) & - df_response['Drug1'].isin(df_drug_ids['Drug']) & - (df_response['Drug2'].isin(df_drug_ids['Drug']) | df_response['Drug2'].isnull())] - - df_response = df_response[df_response['Source'].isin(train_sep_sources + test_sep_sources)] - - df_response.reset_index(drop=True, inplace=True) - - if logger.isEnabledFor(logging.INFO): - logger.info('Summary of filtered dose response by source:') - logger.info(summarize_response_data(df_response, target=agg_dose)) - - df_response = df_response.assign(Group=assign_partition_groups(df_response, partition_by)) - - self.agg_dose = agg_dose - self.cell_features = cell_features - self.drug_features = drug_features - self.cell_df_dict = cell_df_dict - self.drug_df_dict = drug_df_dict - self.df_source = df_source - self.df_response = df_response - self.embed_feature_source = embed_feature_source - self.encode_response_source = encode_response_source - self.all_sources = all_sources - self.train_sources = train_sources - self.test_sources = test_sources - self.train_sep_sources = train_sep_sources - self.test_sep_sources = test_sep_sources - self.partition_by = partition_by - - for var in (list(drug_df_dict.values()) + list(cell_df_dict.values())): - value = locals().get(var) - if value is not None: - setattr(self, var, value) - - self.build_feature_list(single=single) - - if cache: - self.save_to_cache(cache, params) - - -class DataFeeder(keras.utils.data_utils.Sequence): - """Read from pre-joined dataset (HDF5 format) and feed data to the model. 
- """ - def __init__(self, partition='train', filename=None, batch_size=32, shuffle=False, single=False, agg_dose=None): - self.partition = partition - self.filename = filename - self.batch_size = batch_size - self.shuffle = shuffle - self.single = single - self.agg_dose = agg_dose - self.target = agg_dose if agg_dose is not None else 'Growth' - - self.store = pd.HDFStore(filename, mode='r') - self.input_size = len(list(filter(lambda x: x.startswith('/x_train'), self.store.keys()))) - try: - y = self.store.select('y_{}'.format(self.partition)) - self.index = y.index - except KeyError: - self.index = [] - - self.size = len(self.index) - if self.size >= self.batch_size: - self.steps = self.size // self.batch_size - else: - self.steps = 1 - self.batch_size = self.size - self.index_map = np.arange(self.steps) - if self.shuffle: - np.random.shuffle(self.index_map) - - def __len__(self): - return self.steps - - def __getitem__(self, idx): - start = self.index_map[idx] * self.batch_size - stop = (self.index_map[idx] + 1) * self.batch_size - x = [self.store.select('x_{0}_{1}'.format(self.partition, i), start=start, stop=stop) for i in range(self.input_size)] - y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop)[self.target] - return x, y - - def getall(self): - start = 0 - stop = self.size - x = [self.store.select('x_{0}_{1}'.format(self.partition, i), start=start, stop=stop) for i in range(self.input_size)] - y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop)[self.target] - return x, y - - def reset(self): - """ empty method implementation to match reset() in CombinedDataGenerator - """ - pass - - def get_response(self, copy=False): - if self.shuffle: - self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] - df = self.store.get('y_{}'.format(self.partition)).iloc[self.index, :] - else: - df = self.store.get('y_{}'.format(self.partition)) - - if self.agg_dose is None: - df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index, :] - if not self.single: - df['Dose2'] = self.store.get('x_{}_1'.format(self.partition)).iloc[self.index, :] - return df.copy() if copy else df - - def close(self): - self.store.close() - - -class CombinedDataGenerator(keras.utils.data_utils.Sequence): - """Generate training, validation or testing batches from loaded data - """ - def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True, single=False, rank=0, total_ranks=1): - self.data = data - self.partition = partition - self.batch_size = batch_size - self.single = single - - if partition == 'train': - index = data.train_indexes[fold] - elif partition == 'val': - index = data.val_indexes[fold] - else: - index = data.test_indexes[fold] if hasattr(data, 'test_indexes') else [] - - if source: - df = data.df_response[['Source']].iloc[index, :] - index = df.index[df['Source'] == source] - - if shuffle: - index = np.random.permutation(index) - - # sharing by rank - samples_per_rank = len(index) // total_ranks - samples_per_rank = self.batch_size * (samples_per_rank // self.batch_size) - - self.index = index[rank * samples_per_rank:(rank + 1) * samples_per_rank] - self.index_cycle = cycle(self.index) - self.size = len(self.index) - self.steps = self.size // self.batch_size - print("partition:{0}, rank:{1}, sharded index size:{2}, batch_size:{3}, steps:{4}".format(partition, rank, self.size, self.batch_size, self.steps)) - - 
def __len__(self): - return self.steps - - def __getitem__(self, idx): - shard = self.index[idx * self.batch_size:(idx + 1) * self.batch_size] - x_list, y = self.get_slice(self.batch_size, single=self.single, partial_index=shard) - return x_list, y - - def reset(self): - self.index_cycle = cycle(self.index) - - def get_response(self, copy=False): - df = self.data.df_response.iloc[self.index, :].drop(['Group'], axis=1) - return df.copy() if copy else df - - def get_slice(self, size=None, contiguous=True, single=False, dataframe=False, partial_index=None): - size = size or self.size - single = single or self.data.agg_dose - target = self.data.agg_dose or 'Growth' - - if partial_index is not None: - index = partial_index - else: - index = list(islice(self.index_cycle, size)) - df_orig = self.data.df_response.iloc[index, :] - df = df_orig.copy() - - if not single: - df['Swap'] = np.random.choice([True, False], df.shape[0]) - swap = df_orig['Drug2'].notnull() & df['Swap'] - df.loc[swap, 'Drug1'] = df_orig.loc[swap, 'Drug2'] - df.loc[swap, 'Drug2'] = df_orig.loc[swap, 'Drug1'] - if not self.data.agg_dose: - df['DoseSplit'] = np.random.uniform(0.001, 0.999, df.shape[0]) - df.loc[swap, 'Dose1'] = df_orig.loc[swap, 'Dose2'] - df.loc[swap, 'Dose2'] = df_orig.loc[swap, 'Dose1'] - - split = df_orig['Drug2'].isnull() - if not single: - df.loc[split, 'Drug2'] = df_orig.loc[split, 'Drug1'] - if not self.data.agg_dose: - df.loc[split, 'Dose1'] = df_orig.loc[split, 'Dose1'] - np.log10(df.loc[split, 'DoseSplit']) - df.loc[split, 'Dose2'] = df_orig.loc[split, 'Dose1'] - np.log10(1 - df.loc[split, 'DoseSplit']) - - if dataframe: - cols = [target, 'Sample', 'Drug1', 'Drug2'] if not single else [target, 'Sample', 'Drug1'] - y = df[cols].reset_index(drop=True) - else: - y = values_or_dataframe(df[target], contiguous, dataframe) - - x_list = [] - - if not self.data.agg_dose: - doses = ['Dose1', 'Dose2'] if not single else ['Dose1'] - for dose in doses: - x = values_or_dataframe(df[[dose]].reset_index(drop=True), contiguous, dataframe) - x_list.append(x) - - if self.data.encode_response_source: - df_x = pd.merge(df[['Source']], self.data.df_source, on='Source', how='left') - df_x.drop(['Source'], axis=1, inplace=True) - x = values_or_dataframe(df_x, contiguous, dataframe) - x_list.append(x) - - for fea in self.data.cell_features: - df_cell = getattr(self.data, self.data.cell_df_dict[fea]) - df_x = pd.merge(df[['Sample']], df_cell, on='Sample', how='left') - df_x.drop(['Sample'], axis=1, inplace=True) - x = values_or_dataframe(df_x, contiguous, dataframe) - x_list.append(x) - - drugs = ['Drug1', 'Drug2'] if not single else ['Drug1'] - for drug in drugs: - for fea in self.data.drug_features: - df_drug = getattr(self.data, self.data.drug_df_dict[fea]) - df_x = pd.merge(df[[drug]], df_drug, left_on=drug, right_on='Drug', how='left') - df_x.drop([drug, 'Drug'], axis=1, inplace=True) - if dataframe and not single: - df_x = df_x.add_prefix(drug + '.') - x = values_or_dataframe(df_x, contiguous, dataframe) - x_list.append(x) - - # print(x_list, y) - return x_list, y - - def flow(self, single=False): - while 1: - x_list, y = self.get_slice(self.batch_size, single=single) - yield x_list, y - - -def test_generator(loader): - gen = CombinedDataGenerator(loader).flow() - x_list, y = next(gen) - print('x shapes:') - for x in x_list: - print(x.shape) - print('y shape:') - print(y.shape) diff --git a/examples/python/keras/candle_uno/uno_default_model.txt b/examples/python/keras/candle_uno/uno_default_model.txt deleted file mode 
100644 index 72ddeb4547..0000000000 --- a/examples/python/keras/candle_uno/uno_default_model.txt +++ /dev/null @@ -1,33 +0,0 @@ -[Global_Params] -train_sources=['GDSC', 'CTRP', 'ALMANAC'] -test_sources=['train'] -cell_types=None -cell_features=['rnaseq'] -drug_features=['descriptors', 'fingerprints'] -dense=[1000, 1000, 1000] -dense_feature_layers=[1000, 1000, 1000] -activation='relu' -loss='mse' -optimizer='adam' -scaling='std' -dropout=0 -epochs=10 -batch_size=32 -val_split=0.2 -cv=1 -max_val_loss=1.0 -learning_rate=None -base_lr=None -residual=False -reduce_lr=False -warmup_lr=False -batch_normalization=False -feature_subsample=0 -rng_seed=2018 -save_path='save/uno' -no_gen=False -verbose = False -gpus = [0] - -[Monitor_Params] -timeout=3600 diff --git a/examples/python/keras/elementwise_max_min.py b/examples/python/keras/elementwise_max_min.py deleted file mode 100644 index 95291f1273..0000000000 --- a/examples/python/keras/elementwise_max_min.py +++ /dev/null @@ -1,60 +0,0 @@ -from flexflow.keras.layers import Dense, Input, Maximum, Minimum -import flexflow.keras.optimizers - -import numpy as np - -def elementwise_max(): - input0 = Input(shape=(16*2,), dtype="float32") - input1 = Input(shape=(10*1,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - x1 = Dense(20, activation='relu')(input1) # B,20 - - f0 = Maximum()([x0, x1]) # B,20 - - out = Dense(1)(f0) # B,1 - - model = flexflow.keras.models.Model([input0, input1], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = [ - np.random.randn(300, 16*2).astype(np.float32), - np.random.randn(300, 10*1).astype(np.float32), - ], - y = np.random.randn(300, 1).astype(np.float32), - epochs = 2 - ) - - -def elementwise_min(): - input0 = Input(shape=(16*2,), dtype="float32") - input1 = Input(shape=(10*1,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - x1 = Dense(20, activation='relu')(input1) # B,20 - - f0 = Minimum()([x0, x1]) # B,20 - - out = Dense(1)(f0) # B,1 - - model = flexflow.keras.models.Model([input0, input1], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = [ - np.random.randn(300, 16*2).astype(np.float32), - np.random.randn(300, 10*1).astype(np.float32), - ], - y = np.random.randn(300, 1).astype(np.float32), - epochs = 2 - ) - - -if __name__ == '__main__': - elementwise_max() - elementwise_min() diff --git a/examples/python/keras/elementwise_mul_broadcast.py b/examples/python/keras/elementwise_mul_broadcast.py deleted file mode 100644 index d68476a6cb..0000000000 --- a/examples/python/keras/elementwise_mul_broadcast.py +++ /dev/null @@ -1,99 +0,0 @@ -from flexflow.keras.layers import Dense, Input, Reshape, Multiply -import flexflow.keras.optimizers - -import numpy as np - -def broadcast1(): - input0 = Input(shape=(16*2,), dtype="float32") - input1 = Input(shape=(10*1,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - x1 = Dense(10, activation='relu')(input1) # B,10 - - nx0 = Reshape((10,2))(x0) # B,10,2 - nx1 = Reshape((10,1))(x1) # B,10,1 - - m0 = Multiply()([nx1, nx0]) # B,10,2 - f0 = Reshape((20,))(m0) # B,20 - - out = Dense(1)(f0) # B,1 - - model = flexflow.keras.models.Model([input0, input1], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - 
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = [ - np.random.randn(300, 16*2).astype(np.float32), - np.random.randn(300, 10*1).astype(np.float32), - ], - y = np.random.randn(300, 1).astype(np.float32), - epochs = 2 - ) - - -def broadcast2(): - input0 = Input(shape=(16*2,), dtype="float32") - input1 = Input(shape=(10*1,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - x1 = Dense(10, activation='relu')(input1) # B,10 - - nx0 = Reshape((10,2))(x0) # B,10,2 - nx1 = Reshape((10,1))(x1) # B,10,1 - - m0 = Multiply()([nx0, nx1]) # B,10,2 - f0 = Reshape((20,))(m0) # B,20 - - out = Dense(1)(f0) # B,1 - - model = flexflow.keras.models.Model([input0, input1], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = [ - np.random.randn(300, 16*2).astype(np.float32), - np.random.randn(300, 10*1).astype(np.float32), - ], - y = np.random.randn(300, 1).astype(np.float32), - epochs = 2 - ) - - -def broadcast_both(): - input0 = Input(shape=(16*2,), dtype="float32") - input1 = Input(shape=(10*1,), dtype="float32") - - x0 = Dense(2, activation='relu')(input0) # B,20 - x1 = Dense(10, activation='relu')(input1) # B,10 - - nx0 = Reshape((1,2))(x0) # B,1,2 - nx1 = Reshape((10,1))(x1) # B,10,1 - - m0 = Multiply()([nx0, nx1]) # B,10,2 - f0 = Reshape((20,))(m0) # B,20 - - out = Dense(1)(f0) # B,1 - - model = flexflow.keras.models.Model([input0, input1], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = [ - np.random.randn(300, 16*2).astype(np.float32), - np.random.randn(300, 10*1).astype(np.float32), - ], - y = np.random.randn(300, 1).astype(np.float32), - epochs = 2 - ) - - -if __name__ == '__main__': - broadcast1() - broadcast2() - broadcast_both() diff --git a/examples/python/keras/func_cifar10_alexnet.py b/examples/python/keras/func_cifar10_alexnet.py deleted file mode 100644 index c0ade0b722..0000000000 --- a/examples/python/keras/func_cifar10_alexnet.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -from PIL import Image - -def top_level_task(): - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.Resampling.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - if (i == 0): - print(image) - - full_input_np /= 255 - y_train = y_train.astype('int32') - full_label_np = y_train - - input_tensor = Input(shape=(3, 229, 229), dtype="float32") - - output = Conv2D(filters=64, input_shape=(3,229,229), kernel_size=(11,11), strides=(4,4), padding=(2,2), activation="relu")(input_tensor) - output = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding="valid")(output) - output = Conv2D(filters=192, kernel_size=(5,5), strides=(1,1), padding=(2,2), activation="relu")(output) - output = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding="valid")(output) - output = Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output) - output = Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output) - output = Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output) - output = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding="valid")(output) - output = Flatten()(output) - output = Dense(4096, activation="relu")(output) - output = Dense(4096, activation="relu")(output) - output = Dense(10)(output) - output = Activation("softmax")(output) - - model = Model(input_tensor, output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(full_input_np, full_label_np, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_ALEXNET), EpochVerifyMetrics(ModelAccuracy.CIFAR10_ALEXNET)]) - -if __name__ == "__main__": - print("Functional API, cifar10 alexnet") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn.py b/examples/python/keras/func_cifar10_cnn.py deleted file mode 100644 index 423541386f..0000000000 --- a/examples/python/keras/func_cifar10_cnn.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
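The AlexNet example deleted above spends most of its body on preprocessing: each 32x32 CIFAR-10 image is upsampled to the file's chosen 229x229 with PIL nearest-neighbor resizing, staying in the NCHW layout its Conv2D stack expects. That step, isolated as a helper (plain NumPy and Pillow, no FlexFlow calls involved):

import numpy as np
from PIL import Image

def upscale_nchw_uint8(images, size=229):
    # Resize a (N, 3, H, W) uint8 batch to (N, 3, size, size) float32 in [0, 1].
    out = np.zeros((images.shape[0], 3, size, size), dtype=np.float32)
    for i, chw in enumerate(images):
        hwc = chw.transpose(1, 2, 0)  # PIL expects height, width, channels
        big = Image.fromarray(hwc).resize((size, size), Image.Resampling.NEAREST)
        out[i] = np.array(big, dtype=np.float32).transpose(2, 0, 1)
    return out / 255.0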
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32") - - output_tensor = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(input_tensor1) - output_tensor = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = Dense(512, activation="relu")(output_tensor) - output_tensor = Dense(num_classes)(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - model = Model(input_tensor1, output_tensor) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_concat.py b/examples/python/keras/func_cifar10_cnn_concat.py deleted file mode 100644 index 72dfdeffaf..0000000000 --- a/examples/python/keras/func_cifar10_cnn_concat.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def cifar_cnn_sub(input_tensor, name_postfix): - name = "conv2d_0_" + str(name_postfix) - t1 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name=name)(input_tensor) - name = "conv2d_1_" + str(name_postfix) - ot1 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name=name)(t1) - return ot1 - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32", name="input1") - input_tensor2 = Input(shape=(3, 32, 32), dtype="float32", name="input2") - - ot1 = cifar_cnn_sub(input_tensor1, 1) - ot2 = cifar_cnn_sub(input_tensor2, 2) - ot3 = cifar_cnn_sub(input_tensor2, 3) - output_tensor = Concatenate(axis=1)([ot1, ot2, ot3]) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - o1 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_0_4")(output_tensor) - o2 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_1_4")(output_tensor) - output_tensor = Concatenate(axis=1)([o1, o2]) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = Dense(512, activation="relu")(output_tensor) - output_tensor = Dense(num_classes)(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - model = Model([input_tensor1, input_tensor2], output_tensor) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn concat") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_concat_model.py b/examples/python/keras/func_cifar10_cnn_concat_model.py deleted file mode 100644 index 39885bac8c..0000000000 --- a/examples/python/keras/func_cifar10_cnn_concat_model.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
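func_cifar10_cnn_concat.py, deleted above, builds one small Conv2D stack per input through a helper and merges the branches with Concatenate(axis=1), i.e. along the channel axis of the NCHW tensors, so two 32-channel branches yield a 64-channel tensor. The branching pattern in isolation, using the same layer calls as the deleted file (the f-string names are a cosmetic substitution for its string concatenation):

from flexflow.keras.models import Model
from flexflow.keras.layers import Input, Conv2D, Concatenate, Flatten, Dense, Activation

def branch(input_tensor, tag):
    # Two stacked 3x3 convolutions, uniquely named per branch.
    t = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1),
               activation="relu", name=f"conv2d_0_{tag}")(input_tensor)
    return Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1),
                  activation="relu", name=f"conv2d_1_{tag}")(t)

in1 = Input(shape=(3, 32, 32), dtype="float32", name="input1")
in2 = Input(shape=(3, 32, 32), dtype="float32", name="input2")
merged = Concatenate(axis=1)([branch(in1, 1), branch(in2, 2)])  # channels: 32+32
out = Activation("softmax")(Dense(10)(Flatten()(merged)))
model = Model([in1, in2], out)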
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def cifar_cnn_sub(input_tensor, name_postfix): - name = "conv2d_0_" + str(name_postfix) - t1 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name=name)(input_tensor) - name = "conv2d_1_" + str(name_postfix) - ot1 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name=name)(t1) - return ot1 - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32", name="input1") - input_tensor2 = Input(shape=(3, 32, 32), dtype="float32", name="input2") - - ot1 = cifar_cnn_sub(input_tensor1, 1) - model1 = Model(input_tensor1, ot1) - print(model1.summary()) - ot2 = cifar_cnn_sub(input_tensor2, 2) - model2 = Model(input_tensor2, ot2) - print(model2.summary()) - output_tensor = Concatenate(axis=1)([model1.output, model2.output]) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_0_4")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = Dense(512, activation="relu")(output_tensor) - output_tensor = Dense(num_classes)(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - model = Model([input_tensor1, input_tensor2], output_tensor) - # print(model.summary()) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn concat model") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py deleted file mode 100644 index cda95beb49..0000000000 --- 
a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - model1 = Sequential() - model1.add(Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_0_0")) - model1.add(Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_1_0")) - print(model1.summary()) - - model2 = Sequential() - model2.add(Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_0_1")) - model2.add(Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_1_1")) - print(model2.summary()) - - output_tensor = Concatenate(axis=1)([model1.output, model2.output]) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu", name="conv2d_0_4")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = Dense(512, activation="relu")(output_tensor) - output_tensor = Dense(num_classes)(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - model = Model([model1.input[0], model2.input[0]], output_tensor) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn concat sequential model") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_nested.py 
b/examples/python/keras/func_cifar10_cnn_nested.py deleted file mode 100644 index def8a6bcf4..0000000000 --- a/examples/python/keras/func_cifar10_cnn_nested.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32") - output_tensor1 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(input_tensor1) - output_tensor1 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor1) - output_tensor1 = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor1) - model1 = Model(input_tensor1, output_tensor1) - - input_tensor2 = Input(shape=(3, 32, 32), dtype="float32") - output_tensor2 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(input_tensor2) - output_tensor2 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output_tensor2) - output_tensor2 = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor2) - output_tensor2 = Flatten()(output_tensor2) - output_tensor2 = Dense(512, activation="relu")(output_tensor2) - output_tensor2 = Dense(num_classes)(output_tensor2) - output_tensor2 = Activation("softmax")(output_tensor2) - model2 = Model(input_tensor2, output_tensor2) - - input_tensor3 = Input(shape=(3, 32, 32), dtype="float32") - output_tensor3 = model1(input_tensor3) - output_tensor3 = model2(output_tensor3) - model = Model(input_tensor3, output_tensor3) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn nested") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_net2net.py b/examples/python/keras/func_cifar10_cnn_net2net.py 
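func_cifar10_cnn_nested.py, deleted just above, demonstrates that a Model can be called on a tensor exactly like a layer, so networks compose. A shape-consistent sketch of that pattern (the deleted file declares its second sub-model over the raw image shape; here the head's Input matches the extractor's actual output so the composition lines up):

from flexflow.keras.models import Model
from flexflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Activation

# Feature extractor as a standalone Model: (3,32,32) -> (32,16,16).
x_in = Input(shape=(3, 32, 32), dtype="float32")
feat = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(
    Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1),
           activation="relu")(x_in))
extractor = Model(x_in, feat)

# Classifier head as a second Model, consuming the pooled features.
h_in = Input(shape=(32, 16, 16), dtype="float32")
head = Model(h_in, Activation("softmax")(Dense(10)(Flatten()(h_in))))

# Models are callable, so they compose like layers.
img = Input(shape=(3, 32, 32), dtype="float32")
full = Model(img, head(extractor(img)))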
deleted file mode 100644 index 5434e28aca..0000000000 --- a/examples/python/keras/func_cifar10_cnn_net2net.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - #teacher - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32") - - c1 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="same", activation="relu") - c2 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu") - c3 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu") - c4 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu") - d1 = Dense(512, activation="relu") - d2 = Dense(num_classes) - - output_tensor = c1(input_tensor1) - output_tensor = c2(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="same")(output_tensor) - output_tensor = c3(output_tensor) - output_tensor = c4(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = d1(output_tensor) - output_tensor = d2(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - teacher_model = Model(input_tensor1, output_tensor) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - teacher_model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - teacher_model.fit(x_train, y_train, epochs=10) - - c1_kernel, c1_bias = c1.get_weights(teacher_model.ffmodel) - c2_kernel, c2_bias = c2.get_weights(teacher_model.ffmodel) - c3_kernel, c3_bias = c3.get_weights(teacher_model.ffmodel) - c4_kernel, c4_bias = c4.get_weights(teacher_model.ffmodel) - d1_kernel, d1_bias = d1.get_weights(teacher_model.ffmodel) - d2_kernel, d2_bias = d2.get_weights(teacher_model.ffmodel) - #d2_kernel *= 0 - - c2_kernel_new = np.concatenate((c2_kernel, c2_kernel), axis=1) - print(c2_kernel.shape, c2_kernel_new.shape, c2_bias.shape) - - #student model - input_tensor2 = Input(shape=(3, 32, 32), dtype="float32") - - 
sc1_1 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="same", activation="relu") - sc1_2 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="same", activation="relu") - sc2 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu") - sc3 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu") - sc4 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu") - sd1 = Dense(512, activation="relu") - sd2 = Dense(num_classes) - - t1 = sc1_1(input_tensor2) - t2 = sc1_2(input_tensor2) - output_tensor = Concatenate(axis=1)([t1, t2]) - output_tensor = sc2(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="same")(output_tensor) - output_tensor = sc3(output_tensor) - output_tensor = sc4(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = sd1(output_tensor) - output_tensor = sd2(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - student_model = Model(input_tensor2, output_tensor) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - student_model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - sc1_1.set_weights(student_model.ffmodel, c1_kernel, c1_bias) - sc1_2.set_weights(student_model.ffmodel, c1_kernel, c1_bias) - sc2.set_weights(student_model.ffmodel, c2_kernel_new, c2_bias) - sc3.set_weights(student_model.ffmodel, c3_kernel, c3_bias) - sc4.set_weights(student_model.ffmodel, c4_kernel, c4_bias) - sd1.set_weights(student_model.ffmodel, d1_kernel, d1_bias) - sd2.set_weights(student_model.ffmodel, d2_kernel, d2_bias) - - student_model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn teach student") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_mnist_cnn.py b/examples/python/keras/func_mnist_cnn.py deleted file mode 100644 index a81ddd0f94..0000000000 --- a/examples/python/keras/func_mnist_cnn.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
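The Net2Net example above transfers trained teacher weights into a wider student: the teacher's first conv is cloned into two parallel student branches whose outputs are concatenated on the channel axis, so the following conv's kernel must be doubled along its input-channel axis before it can be copied over. The transfer logic, isolated (get_weights/set_weights and the ffmodel handle are used exactly as in the deleted file; note the file copies the kernel twice without rescaling, so the student is an initialization, not a function-preserving clone):

import numpy as np

def widen_first_conv(teacher_model, student_model, c1, c2, sc1_1, sc1_2, sc2):
    # Copy a trained conv pair (c1 -> c2) into a student whose first conv
    # is duplicated into two parallel branches.
    c1_kernel, c1_bias = c1.get_weights(teacher_model.ffmodel)
    c2_kernel, c2_bias = c2.get_weights(teacher_model.ffmodel)
    # Both branches start as exact copies of the teacher's first conv.
    sc1_1.set_weights(student_model.ffmodel, c1_kernel, c1_bias)
    sc1_2.set_weights(student_model.ffmodel, c1_kernel, c1_bias)
    # The next conv sees both branches concatenated on the channel axis,
    # so its kernel is doubled along axis 1 (input channels).
    c2_kernel_new = np.concatenate((c2_kernel, c2_kernel), axis=1)
    sc2.set_weights(student_model.ffmodel, c2_kernel_new, c2_bias)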
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate, Dropout -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy -from flexflow.keras.initializers import GlorotUniform, Zeros - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - img_rows, img_cols = 28, 28 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - input_tensor = Input(shape=(1, 28, 28), dtype="float32") - - output = Conv2D(filters=32, input_shape=(1,28,28), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(input_tensor) -# output = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output) - output = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), kernel_initializer=GlorotUniform(123), bias_initializer=Zeros())(output) - output = Activation('relu')(output) - output = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output) - output = Dropout(0.25)(output) - output = Flatten()(output) - output = Dense(128, activation="relu")(output) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - - model = Model(input_tensor, output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - flatten1 = model.get_layer(name='flat') - t1 = flatten1.output_tensors[0] - t2 = flatten1.input_tensors[0] - print("t1: ", t1.to_layers, " ", t1.from_layer) - print("t2: ", t2.to_layers, " ", t2.from_layer) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) - -if __name__ == "__main__": - print("Functional API, mnist cnn") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_mnist_cnn_concat.py b/examples/python/keras/func_mnist_cnn_concat.py deleted file mode 100644 index 54c1f32d36..0000000000 --- a/examples/python/keras/func_mnist_cnn_concat.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
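func_mnist_cnn.py, deleted above, is the one file in this batch that passes explicit initializers and then introspects the built graph (model.get_layer(...) followed by reading a tensor's to_layers/from_layer links). The initializer usage, isolated from the deleted file:

from flexflow.keras.layers import Conv2D
from flexflow.keras.initializers import GlorotUniform, Zeros

# A seeded Glorot-uniform kernel and a zero bias make the run reproducible.
conv = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1),
              kernel_initializer=GlorotUniform(123),
              bias_initializer=Zeros())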
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - img_rows, img_cols = 28, 28 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - input_tensor = Input(shape=(1, 28, 28), dtype="float32") - - t1 = Conv2D(filters=32, input_shape=(1,28,28), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(input_tensor) - t2 = Conv2D(filters=32, input_shape=(1,28,28), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(input_tensor) - output = concatenate([t1, t2]) - output = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")(output) - output = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output) - output = Flatten()(output) - output = Dense(128, activation="relu")(output) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - - model = Model(input_tensor, output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) - -if __name__ == "__main__": - print("Functional API, mnist cnn concat") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_mnist_mlp.py b/examples/python/keras/func_mnist_mlp.py deleted file mode 100644 index 5521f193c1..0000000000 --- a/examples/python/keras/func_mnist_mlp.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - input_tensor = Input(shape=(784,)) - - output = Dense(512, input_shape=(784,), activation="relu")(input_tensor) - output2 = Dense(512, activation="relu")(output) - output3 = Dense(num_classes)(output2) - output4 = Activation("softmax")(output3) - - model = Model(input_tensor, output4) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', metrics.SparseCategoricalCrossentropy()]) - - model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - -if __name__ == "__main__": - print("Functional API, mnist mlp") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_mnist_mlp_concat.py b/examples/python/keras/func_mnist_mlp_concat.py deleted file mode 100644 index 29b982cea8..0000000000 --- a/examples/python/keras/func_mnist_mlp_concat.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
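func_mnist_mlp.py, deleted above, is the smallest functional-API example in the set: MNIST flattened to 784 features, two ReLU layers of 512 units, a softmax head, and SGD on sparse categorical cross-entropy. Condensed, using only calls from the deleted file (the accuracy-verification callbacks are omitted):

import numpy as np
import flexflow.keras.optimizers
from flexflow.keras.models import Model
from flexflow.keras.layers import Input, Dense, Activation
from flexflow.keras.datasets import mnist

(x_train, y_train), _ = mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32') / 255
y_train = y_train.astype('int32').reshape(len(y_train), 1)

inp = Input(shape=(784,))
h = Dense(512, activation="relu")(Dense(512, activation="relu")(inp))
out = Activation("softmax")(Dense(10)(h))

model = Model(inp, out)
model.compile(optimizer=flexflow.keras.optimizers.SGD(learning_rate=0.01),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy', 'sparse_categorical_crossentropy'])
model.fit(x_train, y_train, epochs=10)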
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(256,)) - input_tensor2 = Input(shape=(256,)) - input_tensor3 = Input(shape=(256,)) - input_tensor4 = Input(shape=(256,)) - - t1 = Dense(512, activation="relu", name="dense1")(input_tensor1) - t1 = Dense(512, activation="relu", name="dense12")(t1) - model1 = Model(input_tensor1, t1) - t2 = Dense(512, activation="relu", name="dense2")(input_tensor2) - t2 = Dense(512, activation="relu", name="dense22")(t2) - model2 = Model(input_tensor2, t2) - t3 = Dense(512, activation="relu", name="dense3")(input_tensor3) - t3 = Dense(512, activation="relu", name="dense33")(t3) - model3 = Model(input_tensor3, t3) - t4 = Dense(512, activation="relu", name="dense4")(input_tensor4) - t4 = Dense(512, activation="relu", name="dense44")(t4) - model4 = Model(input_tensor4, t4) - - input_tensor = Input(shape=(784,)) - t1 = model1(input_tensor) - t2 = model2(input_tensor) - t3 = model3(input_tensor) - t4 = model4(input_tensor) - output = Concatenate(axis=1)([t1, t2, t3, t4]) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - - model = Model(input_tensor, output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - -if __name__ == "__main__": - print("Functional API, mnist mlp concat") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_mnist_mlp_concat2.py b/examples/python/keras/func_mnist_mlp_concat2.py deleted file mode 100644 index 5a35bd9f8b..0000000000 --- a/examples/python/keras/func_mnist_mlp_concat2.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - # input_tensor1 = Input(shape=(256,)) - input_tensor11 = Input(shape=(256,)) - input_tensor12 = Input(shape=(256,)) - input_tensor2 = Input(shape=(256,)) - input_tensor3 = Input(shape=(256,)) - input_tensor4 = Input(shape=(256,)) - - # t1 = Dense(512, activation="relu", name="dense1")(input_tensor1) - # t1 = Dense(512, activation="relu", name="dense12")(t1) - # model1 = Model(input_tensor1, t1) - t11 = Dense(512, activation="relu", name="dense1")(input_tensor11) - model11 = Model(input_tensor11, t11) - t12 = model11(input_tensor12) - t1 = Dense(512, activation="relu", name="dense12")(t12) - model1 = Model(input_tensor12, t1) - - t2 = Dense(512, activation="relu", name="dense2")(input_tensor2) - t2 = Dense(512, activation="relu", name="dense22")(t2) - model2 = Model(input_tensor2, t2) - t3 = Dense(512, activation="relu", name="dense3")(input_tensor3) - t3 = Dense(512, activation="relu", name="dense33")(t3) - model3 = Model(input_tensor3, t3) - t4 = Dense(512, activation="relu", name="dense4")(input_tensor4) - t4 = Dense(512, activation="relu", name="dense44")(t4) - model4 = Model(input_tensor4, t4) - - input_tensor = Input(shape=(784,)) - t00 = Input(shape=(784,), name="input_00") - t01 = Input(shape=(784,), name="input_01") - t1 = model1(input_tensor) - t2 = model2(input_tensor) - t3 = model3(input_tensor) - t4 = model4(input_tensor) - output = Concatenate(axis=1)([t00, t01, t1, t2, t3, t4]) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - - model = Model([t00, t01, input_tensor], output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit([x_train, x_train, x_train], y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - -if __name__ == "__main__": - print("Functional API, mnist mlp concat with input") - top_level_task() - gc.collect() diff --git a/examples/python/keras/func_mnist_mlp_net2net.py b/examples/python/keras/func_mnist_mlp_net2net.py deleted file mode 100644 index ed8589e22e..0000000000 --- a/examples/python/keras/func_mnist_mlp_net2net.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
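The concat2 variant deleted above extends the previous example in one way: raw Input tensors (t00, t01) are fed straight into the Concatenate alongside sub-model outputs, so the final Model is declared over three inputs. That joining step, reduced to its essentials (a sketch; the sub-model indirection of the deleted file is collapsed into a single Dense branch):

from flexflow.keras.models import Model
from flexflow.keras.layers import Input, Dense, Activation, Concatenate

shared_in = Input(shape=(784,))
raw_a = Input(shape=(784,), name="input_00")  # no layers applied to these two
raw_b = Input(shape=(784,), name="input_01")

branch = Dense(512, activation="relu")(shared_in)
# Raw Input tensors and layer outputs concatenate together: 784+784+512.
merged = Concatenate(axis=1)([raw_a, raw_b, branch])
out = Activation("softmax")(Dense(10)(merged))
model = Model([raw_a, raw_b, shared_in], out)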
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - #teacher - - input_tensor1 = Input(shape=(784,), dtype="float32") - - d1 = Dense(512, input_shape=(784,), activation="relu") - d2 = Dense(512, activation="relu") - d3 = Dense(num_classes) - - output = d1(input_tensor1) - output = d2(output) - output = d3(output) - output = Activation("softmax")(output) - - teacher_model = Model(input_tensor1, output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - teacher_model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - teacher_model.fit(x_train, y_train, epochs=10) - - d1_kernel, d1_bias = d1.get_weights(teacher_model.ffmodel) - d2_kernel, d2_bias = d2.get_weights(teacher_model.ffmodel) - d3_kernel, d3_bias = d3.get_weights(teacher_model.ffmodel) - - # student - - input_tensor2 = Input(shape=(784,), dtype="float32") - - sd1_1 = Dense(512, input_shape=(784,), activation="relu") - sd2 = Dense(512, activation="relu") - sd3 = Dense(num_classes) - - output = sd1_1(input_tensor2) - output = sd2(output) - output = sd3(output) - output = Activation("softmax")(output) - - student_model = Model(input_tensor2, output) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - student_model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - sd1_1.set_weights(student_model.ffmodel, d1_kernel, d1_bias) - sd2.set_weights(student_model.ffmodel, d2_kernel, d2_bias) - sd3.set_weights(student_model.ffmodel, d3_kernel, d3_bias) - - student_model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - -if __name__ == "__main__": - print("Functional API, mnist mlp teach student") - top_level_task() - gc.collect() \ No newline at end of file diff --git a/examples/python/keras/gather.py b/examples/python/keras/gather.py deleted file mode 100644 index 15ccd61579..0000000000 --- a/examples/python/keras/gather.py +++ /dev/null @@ -1,45 +0,0 @@ -from flexflow.keras.layers import Dense, Input, Reshape -from flexflow.keras.backend.internal import gather -import flexflow.keras.optimizers - -import numpy as np - - -def get_modified_idx(idx, hidden_shape): - 
return idx.reshape(-1, 1).repeat(hidden_shape, 1).astype(np.int32) - - -def gather_example(): - h = 3 - idx = np.array([[5, 7, 10], [8, 4, 0]]) - # Convert idx to that required by torch.gather - idx = get_modified_idx(idx, h) # 6,3 - - input0 = Input(shape=(10,), dtype="float32") - input1 = Input(shape=idx.shape, dtype="int32") - - x0 = Dense(60, activation='relu')(input0) # B,60 - x0 = Reshape((20, h))(x0) # B,20,3 - f0 = gather(x0, input1, axis=1) # B,6,3 - f0 = Reshape((18,))(f0) - - out = Dense(1)(f0) # B,1 - - model = flexflow.keras.models.Model([input0, input1], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile( - optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x=[ - np.random.randn(300, 10).astype(np.float32), - idx[None, ...].repeat(300, 0).astype(np.int32) - ], - y=np.random.randn(300, 1).astype(np.float32), - epochs=2 - ) - - -if __name__ == '__main__': - gather_example() diff --git a/examples/python/keras/identity_loss.py b/examples/python/keras/identity_loss.py deleted file mode 100644 index d0396c6d46..0000000000 --- a/examples/python/keras/identity_loss.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.layers import Dense, Input, Reshape, Multiply -import flexflow.keras.optimizers - -import numpy as np - -def test_identity_loss(): - input0 = Input(shape=(32,), dtype="float32") - x0 = Dense(20, activation='relu')(input0) # B,20 - out = flexflow.keras.backend.sum(x0, axis=1) # B - - model = flexflow.keras.models.Model(input0, out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.01) - model.compile(optimizer=opt, loss='identity', metrics=['mean_absolute_error']) - print(model.summary()) - model.fit( - x = np.random.randn(300, 32).astype(np.float32), - y = np.zeros((300)).astype(np.float32), - epochs = 2 - ) - - -if __name__ == "__main__": - test_identity_loss() diff --git a/examples/python/keras/reduce_sum.py b/examples/python/keras/reduce_sum.py deleted file mode 100644 index 3857738d4b..0000000000 --- a/examples/python/keras/reduce_sum.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
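gather.py, deleted above, converts a (rows, cols) index array into the per-element layout that its torch.gather-style backend op expects: the indices are flattened to one row per gathered element, then each index is repeated across the hidden dimension. A worked instance of the transformation (plain NumPy, no FlexFlow required):

import numpy as np

idx = np.array([[5, 7, 10], [8, 4, 0]])  # shape (2, 3): six indices in all
h = 3                                    # hidden size of the gathered tensor
mod = idx.reshape(-1, 1).repeat(h, 1).astype(np.int32)
print(mod.shape)  # (6, 3): one row per gathered element, repeated h times
print(mod[0])     # [5 5 5] -> take row 5 of the (20, 3) activations, all h columns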
-# - -from flexflow.keras.layers import Dense, Input, Reshape, Multiply -import flexflow.keras.optimizers - -import numpy as np - -def test_reduce_sum1(): - input0 = Input(shape=(32,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - nx0 = Reshape((10,2))(x0) # B,10,2 - out = flexflow.keras.backend.sum(nx0, axis=1) # B,2 - - model = flexflow.keras.models.Model(input0, out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = np.random.randn(300, 32).astype(np.float32), - y = np.random.randn(300, 2).astype(np.float32), - epochs = 2 - ) - -def test_reduce_sum2(): - input0 = Input(shape=(32,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - nx0 = Reshape((10,2))(x0) # B,10,2 - out = flexflow.keras.backend.sum(nx0, axis=[1,2]) # B - - model = flexflow.keras.models.Model(input0, out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = np.random.randn(300, 32).astype(np.float32), - y = np.random.randn(300).astype(np.float32), - epochs = 2 - ) - -def test_reduce_sum3(): - input0 = Input(shape=(32,), dtype="float32") - - x0 = Dense(20, activation='relu')(input0) # B,20 - nx0 = Reshape((10,2))(x0) # B,10,2 - out = flexflow.keras.backend.sum(nx0, axis=[1,2], keepdims=True) # B,1,1 - - model = flexflow.keras.models.Model(input0, out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = np.random.randn(300, 32).astype(np.float32), - y = np.random.randn(300, 1, 1).astype(np.float32), - epochs = 2 - ) - - -if __name__ == "__main__": - test_reduce_sum1() - test_reduce_sum2() - test_reduce_sum3() diff --git a/examples/python/keras/regularizer.py b/examples/python/keras/regularizer.py deleted file mode 100644 index 3b1e30d04d..0000000000 --- a/examples/python/keras/regularizer.py +++ /dev/null @@ -1,29 +0,0 @@ -import flexflow.keras as keras -from flexflow.keras.layers import Dense, Input, Reshape -from flexflow.keras.backend.internal import gather -import flexflow.keras.optimizers - -import numpy as np - - -def regularizer_example(): - input0 = Input(shape=(10,), dtype="float32") - - reg = keras.regularizers.L2(0.001) - x0 = Dense(16, activation='relu', kernel_regularizer=reg)(input0) - out = Dense(1)(x0) - - model = flexflow.keras.models.Model(input0, out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile( - optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - model.fit( - x=np.random.randn(300, 10).astype(np.float32), - y=np.random.randn(300, 1).astype(np.float32), - epochs=2 - ) - - -if __name__ == '__main__': - regularizer_example() diff --git a/examples/python/keras/reshape.py b/examples/python/keras/reshape.py deleted file mode 100644 index 1acce1b2b6..0000000000 --- a/examples/python/keras/reshape.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
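The three reduce_sum tests deleted above pin down the shape contract of flexflow.keras.backend.sum on a (B, 10, 2) tensor: axis=1 yields (B, 2), axis=[1, 2] collapses to (B,), and keepdims=True retains singleton axes, yielding (B, 1, 1). The same contract, stated against NumPy as the reference semantics (an assumption that the backend op matches np.sum shapes, which is exactly what the tests' target shapes encode):

import numpy as np

x = np.random.randn(300, 10, 2)
assert np.sum(x, axis=1).shape == (300, 2)                         # test_reduce_sum1
assert np.sum(x, axis=(1, 2)).shape == (300,)                      # test_reduce_sum2
assert np.sum(x, axis=(1, 2), keepdims=True).shape == (300, 1, 1)  # test_reduce_sum3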
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate, Reshape -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from accuracy import ModelAccuracy - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - input_tensor = Input(shape=(784,)) - output = Reshape(target_shape=(28, 28))(input_tensor) - output = Reshape(target_shape=(784,))(output) - output = Dense(512, input_shape=(784,), activation="relu")(output) - output2 = Dense(512, activation="relu")(output) - output3 = Dense(num_classes)(output2) - output4 = Activation("softmax")(output3) - - model = Model(input_tensor, output4) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', metrics.SparseCategoricalCrossentropy()]) - print(model.summary()) - model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - -if __name__ == "__main__": - print("Functional API, mnist mlp") - top_level_task() - gc.collect() diff --git a/examples/python/keras/rsqrt.py b/examples/python/keras/rsqrt.py deleted file mode 100644 index be55c8a1fd..0000000000 --- a/examples/python/keras/rsqrt.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from flexflow.keras.layers import Dense, Input -from flexflow.keras.backend.internal import rsqrt -import flexflow.keras.optimizers - -import numpy as np - -def test_rsqrt(): - inp1 = Input(shape=(32,), dtype="float32") - inp2 = Input(shape=(20,), dtype="float32") - - x = Dense(20, activation='relu')(inp1) - out = rsqrt(x + inp2) - - model = flexflow.keras.models.Model([inp1, inp2], out) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.001) - model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error']) - print(model.summary()) - model.fit( - x = [np.random.randn(300, 32).astype(np.float32), - np.ones((300, 20)).astype(np.float32)], - y = np.random.randn(300, 20).astype(np.float32), - epochs = 2 - ) - - -if __name__ == "__main__": - test_rsqrt() diff --git a/examples/python/keras/seq_cifar10_cnn.py b/examples/python/keras/seq_cifar10_cnn.py deleted file mode 100644 index 80f4390d4c..0000000000 --- a/examples/python/keras/seq_cifar10_cnn.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Sequential -from flexflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation -import flexflow.keras.optimizers -from flexflow.keras.datasets import cifar10 -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics - -import flexflow.core as ff -import numpy as np -from accuracy import ModelAccuracy - -def top_level_task(): - - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - model = Sequential() - model.add(Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")) - model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")) - model.add(Flatten()) - model.add(Dense(512, activation="relu")) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.02) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=80, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) - -if __name__ == "__main__": - print("Sequential model, cifar10 cnn") - top_level_task() diff --git a/examples/python/keras/seq_mnist_cnn.py
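rsqrt from flexflow.keras.backend.internal computes the element-wise reciprocal square root; the deleted test keeps its argument positive by adding an all-ones second input to a ReLU Dense output. A NumPy reference for the expected math (a sketch of the op's semantics, not FlexFlow's kernel):

import numpy as np

def rsqrt_ref(x):
    # Element-wise 1/sqrt(x); defined for strictly positive inputs.
    return 1.0 / np.sqrt(x)

x = np.random.rand(4, 20) + 1.0  # positive, mirroring relu(Dense) + ones
np.testing.assert_allclose(rsqrt_ref(x), x ** -0.5)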
b/examples/python/keras/seq_mnist_cnn.py deleted file mode 100644 index eaf0fdfc16..0000000000 --- a/examples/python/keras/seq_mnist_cnn.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Sequential -from flexflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Input -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics - -import flexflow.core as ff -import numpy as np -from accuracy import ModelAccuracy - -def top_level_task(): - - num_classes = 10 - - img_rows, img_cols = 28, 28 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape, x_train.__array_interface__["strides"]) - - layers = [Input(shape=(1, 28, 28), dtype="float32"), - Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu"), - Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu"), - MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"), - Flatten(), - Dense(128, activation="relu"), - Dense(num_classes), - Activation("softmax")] - model = Sequential(layers) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) - -if __name__ == "__main__": - print("Sequential model, mnist cnn") - top_level_task() diff --git a/examples/python/keras/seq_mnist_cnn_nested.py b/examples/python/keras/seq_mnist_cnn_nested.py deleted file mode 100644 index 2c92349cd6..0000000000 --- a/examples/python/keras/seq_mnist_cnn_nested.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from flexflow.keras.models import Sequential, Model -from flexflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, AveragePooling2D, Input -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics - -import flexflow.core as ff -import numpy as np -from accuracy import ModelAccuracy - -def top_level_task(): - - num_classes = 10 - - img_rows, img_cols = 28, 28 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape, x_train.__array_interface__["strides"]) - - layers = [Conv2D(filters=32, input_shape=(1,28,28), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu"), - Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu"), - MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"), - Flatten()] - model1 = Sequential(layers) - - input_tensor = Input(shape=(12544,), dtype="float32") - - output = Dense(512, input_shape=(12544,), activation="relu")(input_tensor) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - - model2 = Model(input_tensor, output) - - model = Sequential() - model.add(model1) - model.add(model2) - - print(model.summary()) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) - -if __name__ == "__main__": - print("Sequential model, mnist cnn nested model") - top_level_task() diff --git a/examples/python/keras/seq_mnist_cnn_net2net.py b/examples/python/keras/seq_mnist_cnn_net2net.py deleted file mode 100644 index 4b9c9c16ba..0000000000 --- a/examples/python/keras/seq_mnist_cnn_net2net.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
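Note: the Dense(512, input_shape=(12544,)) in the nested example above is not arbitrary; it is the flattened output width of the convolutional stack feeding it. A quick check of the arithmetic with the standard output-size formula (a plain-Python sketch, not FlexFlow API):

    def conv_out(size, kernel, stride, pad):
        # standard formula: floor((size + 2*pad - kernel) / stride) + 1
        return (size + 2 * pad - kernel) // stride + 1

    h = conv_out(28, 3, 1, 1)  # first Conv2D, padding=(1,1): 28 -> 28
    h = conv_out(h, 3, 1, 1)   # second Conv2D:               28 -> 28
    h = conv_out(h, 2, 2, 0)   # MaxPooling2D 2x2, stride 2:  28 -> 14
    print(64 * h * h)          # 64 channels * 14 * 14 = 12544
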
-# - -from flexflow.keras.models import Sequential -from flexflow.keras.layers import Flatten, Dense, Activation, Conv2D, MaxPooling2D -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics - -import flexflow.core as ff -import numpy as np -from accuracy import ModelAccuracy - -def create_teacher_model_cnn(num_classes, x_train, y_train): - model = Sequential() - model.add(Conv2D(filters=32, input_shape=(1,28,28), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")) - model.add(Flatten()) - model.add(Dense(128, activation="relu")) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit(x_train, y_train, epochs=5) - return model - -def create_student_model_cnn(teacher_model, num_classes, x_train, y_train): - conv1 = teacher_model.get_layer(index=0) - c1_kernel, c1_bias = conv1.get_weights(teacher_model.ffmodel) - print(c1_kernel.shape, c1_bias.shape) - - conv2 = teacher_model.get_layer(index=1) - c2_kernel, c2_bias = conv2.get_weights(teacher_model.ffmodel) - - dense1 = teacher_model.get_layer(index=4) - d1_kernel, d1_bias = dense1.get_weights(teacher_model.ffmodel) - - dense2 = teacher_model.get_layer(index=5) - d2_kernel, d2_bias = dense2.get_weights(teacher_model.ffmodel) - - model = Sequential() - model.add(Conv2D(filters=32, input_shape=(1,28,28), kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding=(1,1), activation="relu")) - model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")) - model.add(Flatten()) - model.add(Dense(128, activation="relu", name="dense1")) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - conv1s = model.get_layer(index=0) - conv2s = model.get_layer(index=1) - dense1s = model.get_layer(name="dense1") - dense2s = model.get_layer(index=5) - - conv1s.set_weights(model.ffmodel, c1_kernel, c1_bias) - conv2s.set_weights(model.ffmodel, c2_kernel, c2_bias) - dense1s.set_weights(model.ffmodel, d1_kernel, d1_bias) - dense2s.set_weights(model.ffmodel, d2_kernel, d2_bias) - - print(model.summary()) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) - -def top_level_task(): - num_classes = 10 - - img_rows, img_cols = 28, 28 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - teacher_model = create_teacher_model_cnn(num_classes, x_train, y_train) - - create_student_model_cnn(teacher_model, num_classes, x_train, y_train) - -if __name__ == "__main__": - print("Sequential model, mnist mlp teacher student") - top_level_task() diff --git 
a/examples/python/keras/seq_mnist_mlp.py b/examples/python/keras/seq_mnist_mlp.py deleted file mode 100644 index 21c7435eb7..0000000000 --- a/examples/python/keras/seq_mnist_mlp.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Sequential -from flexflow.keras.layers import Flatten, Dense, Activation, Dropout -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics -from flexflow.keras.initializers import GlorotUniform, Zeros - -import flexflow.core as ff -import numpy as np -from accuracy import ModelAccuracy - -def top_level_task(): - - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - model = Sequential() - d1 = Dense(512, input_shape=(784,), kernel_initializer=GlorotUniform(123), bias_initializer=Zeros()) - model.add(d1) - model.add(Activation('relu')) - model.add(Dropout(0.2)) - model.add(Dense(512, activation="relu")) - model.add(Dropout(0.2)) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit(x_train, y_train, epochs=20, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - model.evaluate(x=x_train, y=y_train) - -if __name__ == "__main__": - print("Sequential model, mnist mlp") - top_level_task() diff --git a/examples/python/keras/seq_mnist_mlp_net2net.py b/examples/python/keras/seq_mnist_mlp_net2net.py deleted file mode 100644 index 628f76db3a..0000000000 --- a/examples/python/keras/seq_mnist_mlp_net2net.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from flexflow.keras.models import Sequential -from flexflow.keras.layers import Flatten, Dense, Activation, Conv2D, MaxPooling2D -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.callbacks import Callback, VerifyMetrics, EpochVerifyMetrics - -import flexflow.core as ff -import numpy as np -from accuracy import ModelAccuracy - -def create_teacher_model_mlp(num_classes, x_train, y_train): - model = Sequential() - model.add(Dense(512, input_shape=(784,), activation="relu")) - model.add(Dense(512, activation="relu")) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - model.fit(x_train, y_train, epochs=1) - - dense3 = model.get_layer(index=2) - d3_kernel, d3_bias = dense3.get_weights(model.ffmodel) - print(d3_bias) - d3_kernel = np.reshape(d3_kernel, (d3_kernel.shape[1], d3_kernel.shape[0])) - print(d3_kernel) - return model - -def create_student_model_mlp(teacher_model, num_classes, x_train, y_train): - dense1 = teacher_model.get_layer(index=0) - d1_kernel, d1_bias = dense1.get_weights(teacher_model.ffmodel) - print(d1_kernel.shape, d1_bias.shape) - dense2 = teacher_model.get_layer(index=1) - d2_kernel, d2_bias = dense2.get_weights(teacher_model.ffmodel) - - dense3 = teacher_model.get_layer(index=2) - d3_kernel, d3_bias = dense3.get_weights(teacher_model.ffmodel) - - model = Sequential() - model.add(Dense(512, input_shape=(784,), activation="relu")) - model.add(Dense(512, activation="relu")) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - dense1s = model.get_layer(index=0) - dense2s = model.get_layer(index=1) - dense3s = model.get_layer(index=2) - - dense1s.set_weights(model.ffmodel, d1_kernel, d1_bias) - dense2s.set_weights(model.ffmodel, d2_kernel, d2_bias) - dense3s.set_weights(model.ffmodel, d3_kernel, d3_bias) - - d3_kernel, d3_bias = dense3s.get_weights(model.ffmodel) - print(d3_kernel) - print(d3_bias) - - model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) - -def top_level_task(): - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - teacher_model = create_teacher_model_mlp(num_classes, x_train, y_train) - - create_student_model_mlp(teacher_model, num_classes, x_train, y_train) - -if __name__ == "__main__": - print("Sequential model, mnist mlp teacher student") - top_level_task() diff --git a/examples/python/keras/seq_reuters_mlp.py b/examples/python/keras/seq_reuters_mlp.py deleted file mode 100644 index 5412ad0599..0000000000 --- a/examples/python/keras/seq_reuters_mlp.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
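Note: both *_net2net examples above repeat the same teacher-to-student weight transfer by hand, layer by layer. The pattern they use, factored into one helper; this is a sketch built only from the get_weights/set_weights calls shown in those examples, not part of the FlexFlow API:

    def transfer_weights(teacher_model, student_model, index):
        # Read the trained kernel/bias out of the teacher's layer `index`
        # and load them into the student's layer at the same position.
        kernel, bias = teacher_model.get_layer(index=index).get_weights(teacher_model.ffmodel)
        student_model.get_layer(index=index).set_weights(student_model.ffmodel, kernel, bias)

    # e.g. for the MLP student above: for i in (0, 1, 2): transfer_weights(teacher, student, i)
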
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.keras.models import Sequential -from flexflow.keras.layers import Flatten, Dense, Activation, Input -import flexflow.keras.optimizers -from flexflow.keras.datasets import reuters -from flexflow.keras.preprocessing.text import Tokenizer -from flexflow.keras.callbacks import Callback, VerifyMetrics - -import numpy as np -from accuracy import ModelAccuracy - -def top_level_task(): - - max_words = 1000 - epochs = 5 - - print('Loading data...') - (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, - test_split=0.2) - print(len(x_train), 'train sequences') - print(len(x_test), 'test sequences') - - num_classes = np.max(y_train) + 1 - print(num_classes, 'classes') - - print('Vectorizing sequence data...') - tokenizer = Tokenizer(num_words=max_words) - x_train = tokenizer.sequences_to_matrix(x_train, mode='binary') - x_test = tokenizer.sequences_to_matrix(x_test, mode='binary') - x_train = x_train.astype('float32') - print('x_train shape:', x_train.shape) - print('x_test shape:', x_test.shape) - - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print('y_train shape:', y_train.shape) - - model = Sequential() - model.add(Input(shape=(max_words,))) - model.add(Dense(512, activation="relu")) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = flexflow.keras.optimizers.Adam(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=epochs, callbacks=[VerifyMetrics(ModelAccuracy.REUTERS_MLP)]) - -if __name__ == "__main__": - print("Sequential model, reuters mlp") - top_level_task() diff --git a/examples/python/keras/unary.py b/examples/python/keras/unary.py deleted file mode 100644 index 622e15dc2d..0000000000 --- a/examples/python/keras/unary.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
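Note: the Reuters example above relies on Tokenizer.sequences_to_matrix(..., mode='binary') to turn word-id sequences into fixed-width inputs. Its effect is easy to state in plain numpy; a minimal equivalent for illustration, not the library implementation:

    import numpy as np

    def sequences_to_binary_matrix(sequences, num_words):
        # One row per document; 1.0 wherever a word id < num_words occurs.
        m = np.zeros((len(sequences), num_words), dtype='float32')
        for row, seq in enumerate(sequences):
            for idx in seq:
                if idx < num_words:
                    m[row, idx] = 1.0
        return m

    print(sequences_to_binary_matrix([[1, 3], [2]], num_words=5))
    # [[0. 1. 0. 1. 0.]
    #  [0. 0. 1. 0. 0.]]
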
-# - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Add, Subtract, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, add, subtract, Input -import flexflow.keras.optimizers -from flexflow.keras.datasets import mnist -from flexflow.keras.datasets import cifar10 - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def add_test(): - input1 = Input(shape=(16, ), dtype="float32") - x1 = Dense(8, activation='relu')(input1) - input2 = Input(shape=(32,), dtype="float32") - x2 = Dense(8, activation='relu')(input2) - subtracted = Add()([x1, x2]) - - out = Dense(4)(subtracted) - model = Model([input1, input2], out) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - model.ffmodel.init_layers() - -def subtract_test(): - input1 = Input(shape=(16, ), dtype="float32") - x1 = Dense(8, activation='relu')(input1) - input2 = Input(shape=(32, ), dtype="float32") - x2 = Dense(8, activation='relu')(input2) - subtracted = subtract([x1, x2]) - - out = Dense(4)(subtracted) - model = Model([input1, input2], out) - - opt = flexflow.keras.optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - model.ffmodel.init_layers() - -def top_level_task(): - - add_test() - subtract_test() - - -if __name__ == "__main__": - print("alexnet keras") - top_level_task() \ No newline at end of file diff --git a/examples/python/keras_exp/func_cifar10_cnn.py b/examples/python/keras_exp/func_cifar10_cnn.py deleted file mode 100644 index 1d68fc8a14..0000000000 --- a/examples/python/keras_exp/func_cifar10_cnn.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from tensorflow.keras import backend - -from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Input -from tensorflow.keras import optimizers - -from flexflow.keras_exp.models import Model -from flexflow.keras.datasets import cifar10 - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - backend.set_image_data_format('channels_first') - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32") - - output_tensor = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(input_tensor1) - output_tensor = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = Dense(512, activation="relu")(output_tensor) - output_tensor = Dense(num_classes)(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - model = Model({1: input_tensor1}, output_tensor) - - opt = optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=1) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn") - top_level_task() - gc.collect() \ No newline at end of file diff --git a/examples/python/keras_exp/func_cifar10_cnn_concat.py b/examples/python/keras_exp/func_cifar10_cnn_concat.py deleted file mode 100644 index 0c98c9d10d..0000000000 --- a/examples/python/keras_exp/func_cifar10_cnn_concat.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from tensorflow.keras import backend - -from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Input, Concatenate -from tensorflow.keras import optimizers - -from flexflow.keras_exp.models import Model -from flexflow.keras.datasets import cifar10 - -import flexflow.core as ff -import numpy as np -import argparse -import gc - - -def top_level_task(): - backend.set_image_data_format('channels_first') - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32") - - o1 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(input_tensor1) - o2 = Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(input_tensor1) - output_tensor = Concatenate(axis=1)([o1, o2]) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) - output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) - output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) - output_tensor = Flatten()(output_tensor) - output_tensor = Dense(512, activation="relu")(output_tensor) - output_tensor = Dense(num_classes)(output_tensor) - output_tensor = Activation("softmax")(output_tensor) - - model = Model({1: input_tensor1}, output_tensor) - - opt = optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=1) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn") - top_level_task() - gc.collect() \ No newline at end of file diff --git a/examples/python/keras_exp/func_cifar10_cnn_nested.py b/examples/python/keras_exp/func_cifar10_cnn_nested.py deleted file mode 100644 index 8ae20519dd..0000000000 --- a/examples/python/keras_exp/func_cifar10_cnn_nested.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
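Note: with backend.set_image_data_format('channels_first'), axis=1 in Concatenate is the channel axis, so the concat example above stacks the two conv branches' feature maps rather than their spatial rows. In numpy terms (shapes chosen to match a 3x3 valid conv over 32x32 input, i.e. 30x30 maps):

    import numpy as np

    a = np.zeros((8, 32, 30, 30), dtype='float32')  # (batch, channels, H, W)
    b = np.zeros((8, 32, 30, 30), dtype='float32')
    print(np.concatenate([a, b], axis=1).shape)     # (8, 64, 30, 30): 32+32 channels
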
-# - -from tensorflow.keras import backend -from tensorflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -from tensorflow.keras import optimizers - -from flexflow.keras_exp.models import Model -from flexflow.keras.datasets import cifar10 - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - backend.set_image_data_format('channels_first') - num_classes = 10 - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(3, 32, 32), dtype="float32") - output_tensor1 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(input_tensor1) - output_tensor1 = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor1) - output_tensor1 = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor1) - model1 = Model(input_tensor1, output_tensor1) - - input_tensor2 = Input(shape=(32, 14, 14), dtype="float32") - output_tensor2 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(input_tensor2) - output_tensor2 = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor2) - output_tensor2 = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor2) - output_tensor2 = Flatten()(output_tensor2) - output_tensor2 = Dense(512, activation="relu")(output_tensor2) - output_tensor2 = Dense(num_classes)(output_tensor2) - output_tensor2 = Activation("softmax")(output_tensor2) - model2 = Model(input_tensor2, output_tensor2) - - input_tensor3 = Input(shape=(3, 32, 32), dtype="float32") - output_tensor3 = model1(input_tensor3) - output_tensor3 = model2(output_tensor3) - model = Model({3: input_tensor3}, output_tensor3) - - opt = optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - print(model.summary()) - - model.fit(x_train, y_train, epochs=1) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn nested") - top_level_task() - gc.collect() \ No newline at end of file diff --git a/examples/python/keras_exp/func_mnist_mlp.py b/examples/python/keras_exp/func_mnist_mlp.py deleted file mode 100644 index 23734ce27e..0000000000 --- a/examples/python/keras_exp/func_mnist_mlp.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from tensorflow.keras import backend - -from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Input -from tensorflow.keras import optimizers - -from flexflow.keras_exp.models import Model - -from flexflow.keras.datasets import mnist - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - - backend.set_image_data_format('channels_first') - - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - input_tensor = Input(shape=(784)) - output = Dense(512, activation="relu")(input_tensor) - output = Dense(512, activation="relu")(output) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - model = Model(inputs={1: input_tensor}, outputs=output) - - print(model.summary()) - - opt = optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - model.fit(x_train, y_train, batch_size=64, epochs=1) - - -if __name__ == "__main__": - print("Functional API, mnist mlp") - top_level_task() - gc.collect() \ No newline at end of file diff --git a/examples/python/keras_exp/func_mnist_mlp_concat.py b/examples/python/keras_exp/func_mnist_mlp_concat.py deleted file mode 100644 index cc0a6c24d3..0000000000 --- a/examples/python/keras_exp/func_mnist_mlp_concat.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from tensorflow.keras import backend -from tensorflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Concatenate, concatenate -from tensorflow.keras import optimizers - -from flexflow.keras_exp.models import Model -from flexflow.keras.datasets import mnist - -import flexflow.core as ff -import numpy as np -import argparse -import gc - -def top_level_task(): - backend.set_image_data_format('channels_first') - num_classes = 10 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print("shape: ", x_train.shape) - - input_tensor1 = Input(shape=(784,)) - input_tensor2 = Input(shape=(784,)) - input_tensor3 = Input(shape=(784,)) - input_tensor4 = Input(shape=(784,)) - - t1 = Dense(512, activation="relu", name="dense1")(input_tensor1) - t1 = Dense(512, activation="relu", name="dense12")(t1) - model1 = Model(input_tensor1, t1) - t2 = Dense(512, activation="relu", name="dense2")(input_tensor2) - t2 = Dense(512, activation="relu", name="dense22")(t2) - model2 = Model(input_tensor2, t2) - t3 = Dense(512, activation="relu", name="dense3")(input_tensor3) - t3 = Dense(512, activation="relu", name="dense33")(t3) - model3 = Model(input_tensor3, t3) - t4 = Dense(512, activation="relu", name="dense4")(input_tensor4) - t4 = Dense(512, activation="relu", name="dense44")(t4) - model4 = Model(input_tensor4, t4) - - input_tensor1 = Input(shape=(784,)) - input_tensor2 = Input(shape=(784,)) - t1 = model1(input_tensor1) - t2 = model2(input_tensor1) - t3 = model3(input_tensor2) - t4 = model4(input_tensor2) - output = Concatenate(axis=1)([t1, t2, t3, t4]) - output = Dense(num_classes)(output) - output = Activation("softmax")(output) - - model = Model({5: input_tensor1, 6: input_tensor2}, output) - - opt = optimizers.SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - - print(model.summary()) - - model.fit([x_train, x_train], y_train, epochs=1) - -if __name__ == "__main__": - print("Functional API, mnist mlp concat") - top_level_task() - gc.collect() \ No newline at end of file diff --git a/examples/python/native/__init__.py b/examples/python/native/__init__.py deleted file mode 100644 index 5f0938035a..0000000000 --- a/examples/python/native/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .accuracy import ModelAccuracy diff --git a/examples/python/native/accuracy.py b/examples/python/native/accuracy.py deleted file mode 100644 index 30b15402f4..0000000000 --- a/examples/python/native/accuracy.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from enum import Enum - -class ModelAccuracy(Enum): - MNIST_MLP = 90 - MNIST_CNN = 90 - REUTERS_MLP = 90 - CIFAR10_CNN = 90 - CIFAR10_ALEXNET = 90 diff --git a/examples/python/native/alexnet.py b/examples/python/native/alexnet.py deleted file mode 100644 index 61397cefc1..0000000000 --- a/examples/python/native/alexnet.py +++ /dev/null @@ -1,142 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 - -from accuracy import ModelAccuracy -from PIL import Image -import argparse -import numpy as np - - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" % ( - ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 229, 229] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - kernel_init = GlorotUniformInitializer(123) - bias_init = ZeroInitializer() - t = ffmodel.conv2d(input_tensor, 64, 11, 11, 4, 4, 2, 2, - ActiMode.AC_MODE_RELU, 1, True, None, kernel_init, bias_init) - t = ffmodel.pool2d(t, 3, 3, 2, 2, 0, 0) - t = ffmodel.conv2d(t, 192, 5, 5, 1, 1, 2, 2, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 3, 3, 2, 2, 0, 0) - t = ffmodel.conv2d(t, 384, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 256, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 256, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 3, 3, 2, 2, 0, 0) - t = ffmodel.flat(t) - t = ffmodel.dense(t, 4096, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 4096, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[ - MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229, 229), Image.Resampling.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - if (i == 0): - print(image) - - full_input_np /= 255 - print(full_input_np.shape) - print(full_input_np.__array_interface__["strides"]) - print(full_input_np[0, :, :, :]) - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_np) - dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start) - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" % - (epochs, run_time, num_samples * epochs / run_time)) - perf_metrics = ffmodel.get_perf_metrics() - - return perf_metrics - accuracy = perf_metrics.get_accuracy() - if accuracy < 
ModelAccuracy.CIFAR10_ALEXNET.value: - assert 0, 'Check Accuracy' - - # conv_2d1 = ffmodel.get_layer_by_id(0) - # cbias_tensor = conv_2d1.get_input_tensor() - # cbias_tensor.inline_map(ffconfig) - # cbias = cbias_tensor.get_flat_array(ffconfig, DataType.DT_FLOAT) - # print(cbias.shape) - # print(cbias) - # #save_image(cbias, 2) - # cbias_tensor.inline_unmap(ffconfig) - - # label.inline_map(ffconfig) - # label_array = label.get_flat_array(ffconfig, DataType.DT_INT32) - # print(label_array.shape) - # # print(cbias) - # print(label_array) - # label.inline_unmap(ffconfig) - - # ffmodel.print_layers(0) - - -def save_image(batch_image_array, id): - image_array = batch_image_array[id, :, :, :] - image_array = image_array.transpose(1, 2, 0) - image_array = image_array*255 - image_array = image_array.astype('uint8') - pil_image = Image.fromarray(image_array).convert('RGB') - pil_image.save("img.jpeg") - - -def test_accuracy(): - perf_metrics = top_level_task() - accuracy = perf_metrics.get_accuracy() - try: - assert ( - accuracy >= ModelAccuracy.CIFAR10_ALEXNET.value), "Accuracy less than 90%" - except AssertionError as e: - raise - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-a", "--test_acc", - action="store_true", help="Test accuracy flag") - args, unknown = parser.parse_known_args() - if args.test_acc: - print("Testing cifar10 alexnet training accuracy") - test_accuracy() - else: - print("cifar10 alexnet") - top_level_task() diff --git a/examples/python/native/bert_proxy_native.py b/examples/python/native/bert_proxy_native.py deleted file mode 100644 index a1c59a83d4..0000000000 --- a/examples/python/native/bert_proxy_native.py +++ /dev/null @@ -1,123 +0,0 @@ -from flexflow.core import * - -from argparse import ArgumentParser - -import sys -import numpy as np - -def parse_args(): - print(sys.argv) - parser = ArgumentParser() - -# BERT-large - parser.add_argument('--seq-length', default=512, type=int) - parser.add_argument('--num-heads', default=16, type=int) - parser.add_argument('--hidden-size', default=1024, type=int) - parser.add_argument('--num_layers', default=24, type=int) - parser.add_argument('--iterations', default=10, type=int) - - args, unknown = parser.parse_known_args() - - return args - -def mha(model, q, k, v, batch_size, seq_length, hidden_size, n_heads, kdim, vdim, act=ActiMode.AC_MODE_GELU): - q = model.dense(q, hidden_size) - k = model.dense(k, hidden_size) - v = model.dense(v, hidden_size) - - q = model.reshape(q, shape=(batch_size, seq_length, n_heads, kdim)) - k = model.reshape(k, shape=(batch_size, seq_length, n_heads, kdim)) - v = model.reshape(v, shape=(batch_size, seq_length, n_heads, vdim)) - q = model.transpose(q, perm=(0, 2, 1, 3)) - k = model.transpose(k, perm=(0, 2, 3, 1)) - v = model.transpose(v, perm=(0, 2, 1, 3)) - logits = model.batch_matmul(q, k, a_seq_length_dim=2,b_seq_length_dim=3) - #logits = model.softmax(logits) - output = model.batch_matmul(logits, v, a_seq_length_dim=3,b_seq_length_dim=2) - output = model.transpose(output, perm=(0, 2, 1, 3)) - output = model.reshape(output, shape=(batch_size, seq_length, hidden_size)) - output = model.dense(output, hidden_size, act) -# output = model.dense(output, hidden_size) - return output - -def create_bert_layer(model, input, batch_size, seq_length, hidden_size, n_heads, kdim, vdim, act=ActiMode.AC_MODE_GELU): - t = input - # MHA -# t = model.multihead_attention( -# t, t, t, -# hidden_size, n_heads, kdim, vdim) - t = mha(model, t, t, t, batch_size, seq_length, 
hidden_size, n_heads, kdim, vdim) - # t = model.dense(input, hidden_size, act) - t = model.dense(t, hidden_size, act) - # t = model.dropout(t) - t = model.add(t, input) - - # Intermediate - intermediate_out = model.dense(t, hidden_size, act) - - # Output - t = model.dense(intermediate_out, hidden_size, act) - # t = model.dropout(t) - t = model.add(t, intermediate_out) - - return t - -def top_level_task(): - args = parse_args() - - ffconfig = FFConfig() - netconfig = NetConfig() - - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" - %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - - ffmodel = FFModel(ffconfig) - - batch_size = ffconfig.batch_size - seq_length = args.seq_length - hidden_size = args.hidden_size - num_heads = args.num_heads - num_layers = args.num_layers - kdim = hidden_size // num_heads - vdim = hidden_size // num_heads - - print('Model config:') - print(f" seq_length: {seq_length}") - print(f" hidden_size: {hidden_size}") - print(f" num_heads: {num_heads}") - print(f" kdim: {kdim}") - print(f" vdim: {vdim}") - - dims_input = [batch_size, seq_length, hidden_size] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - np_input_tensor = np.zeros(dims_input, dtype=np.float32) - input_tensor.set_tensor(ffmodel, np_input_tensor) - #input_tensor.attach_numpy_array(ffconfig, np_input_tensor) - #input_tensor.detach_numpy_array(ffconfig) - - # build the model - t = input_tensor - for _ in range(num_layers): - t = create_bert_layer(ffmodel, t, batch_size, seq_length, hidden_size, num_heads, kdim, vdim) - - # t now contains entire model. Add single-neuron output - t = ffmodel.dense(t, 1) - - optimizer = SGDOptimizer(ffmodel, 1e-3) - ffmodel.optimizer = optimizer - ffmodel.compile(loss_type=LossType.LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics=[MetricsType.METRICS_ACCURACY], comp_mode=CompMode.INFERENCE) - ffmodel.init_layers() - ts_start = ffconfig.get_current_time() - - iterations = args.iterations - for it in range(iterations): -# print(f" ITERATION: {it}") - ffconfig.begin_trace(111) - ffmodel.forward(seq_length=it) - ffconfig.end_trace(111) - ts_end = ffconfig.get_current_time() - print(f" Time taken to run forward pass: {(ts_end - ts_start)/iterations}") - -if __name__ == "__main__": - print("BERT Proxy") - top_level_task() diff --git a/examples/python/native/bert_proxy_run_script.sh b/examples/python/native/bert_proxy_run_script.sh deleted file mode 100755 index e160fc8124..0000000000 --- a/examples/python/native/bert_proxy_run_script.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -ulimit -l unlimited - -fs=12000 -zs=12000 -b=1 -g=1 -budget=50000 - -#<7.5B -script='bert_proxy_native.py -ll:py 1 --iterations 100 --seq-length 128 --num-heads 32 --hidden-size 4096 --num_layers 36' -ss=7.5b.l18 - -#$FF_HOME/python/flexflow_python $script -ll:gpu 1 -ll:fsize $fs -ll:zsize $zs -b $b --budget 1000 --export ./$ss.b$b.g1.bg$budget -#$FF_HOME/python/flexflow_python $script -ll:gpu $g -ll:fsize $fs -ll:zsize $zs -b $b --budget $budget --enable-parameter-parallel --enable-attribute-parallel --export ./$ss.b$b.g$g.bg$budget --import ./$ss.b$b.g1.bg$budget --taskgraph ./tg.$ss.b$b.g$g.bg$budget - -"$FF_HOME"/python/flexflow_python "$script" -ll:gpu "$g" -ll:fsize "$fs" -ll:zsize "$zs" -b "$b" --enable-parameter-parallel --enable-attribute-parallel --import "./$ss.b$b.g$g.bg$budget" -#-lg:prof 1 -logfile spy_$ss.%.log -lg:spy -lg:prof_logfile prof_$ss.%.gz diff --git a/examples/python/native/cifar10_cnn.py 
b/examples/python/native/cifar10_cnn.py deleted file mode 100644 index 44bdce4519..0000000000 --- a/examples/python/native/cifar10_cnn.py +++ /dev/null @@ -1,99 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 - -from accuracy import ModelAccuracy -import argparse - - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" % ( - ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 32, 32] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - t = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, - 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0,) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0) - t = ffmodel.flat(t) - t = ffmodel.dense(t, 512, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[ - MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - full_input_array = x_train - print(full_input_array.__array_interface__["strides"]) - - y_train = y_train.astype('int32') - full_label_array = y_train - - print(full_input_array.__array_interface__["strides"]) - print(full_input_array.shape, full_label_array.shape) - # print(full_input_array[0,:,:,:]) - #print(full_label_array[0, 0:64]) - print(full_label_array.__array_interface__["strides"]) - - dataloader_input = ffmodel.create_data_loader( - input_tensor, full_input_array) - dataloader_label = ffmodel.create_data_loader( - label_tensor, full_label_array) - - num_samples = dataloader_input.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start) - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" % - (epochs, run_time, num_samples * epochs / run_time)) - - perf_metrics = ffmodel.get_perf_metrics() - - return perf_metrics - - -def test_accuracy(): - perf_metrics = top_level_task() - accuracy = perf_metrics.get_accuracy() - try: - assert (accuracy >= ModelAccuracy.CIFAR10_CNN.value), "Accuracy less than 90%" - except AssertionError as e: - raise - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-a", "--test_acc", - action="store_true", help="Test accuracy flag") - args, unknown = parser.parse_known_args() - if args.test_acc: - print("Testing cifar10 cnn training accuracy") - test_accuracy() - else: - print("cifar10 cnn") - top_level_task() diff --git a/examples/python/native/cifar10_cnn_attach.py b/examples/python/native/cifar10_cnn_attach.py deleted file mode 100644 index ba4288c8cd..0000000000 --- a/examples/python/native/cifar10_cnn_attach.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2023 CMU, 
Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from accuracy import ModelAccuracy - -def next_batch(idx, x_train, input1, ffconfig, ffmodel): - start = idx*ffconfig.batch_size - x_train_batch = x_train[start:start+ffconfig.batch_size, :, :, :] - print(x_train_batch.shape) - - # input1.inline_map(ffconfig) - # input_array = input1.get_array(ffconfig, DataType.DT_FLOAT) - # print(input_array.shape) - # for i in range(0, ffconfig.batch_size): - # for j in range(0, 3): - # for k in range(0, 32): - # for l in range(0, 32): - # input_array[i][j][k][l] = x_train_batch[i][j][k][l] - # input1.inline_unmap(ffconfig) - - input1.set_tensor(ffmodel, x_train_batch) - -def next_batch_label(idx, x_train, input1, ffconfig, ffmodel): - start = idx*ffconfig.batch_size - x_train_batch = x_train[start:start+ffconfig.batch_size, :] - print(x_train_batch.shape) - - # input1.inline_map(ffconfig) - # input_array = input1.get_array(ffconfig, DataType.DT_INT32) - # print(input_array.shape) - # for i in range(0, ffconfig.batch_size): - # for j in range(0, 1): - # input_array[i][j] = x_train_batch[i][j] - # input1.inline_unmap(ffconfig) - input1.set_tensor(ffmodel, x_train_batch) - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 32, 32] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - - full_input_array = x_train - print(full_input_array.__array_interface__["strides"]) - - y_train = y_train.astype('int32') - - full_label_array = y_train - - print(full_input_array.__array_interface__["strides"]) - print(full_input_array.shape, full_label_array.shape) - print(full_label_array.__array_interface__["strides"]) - - t = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0,) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0) - t = ffmodel.flat(t); - t = ffmodel.dense(t, 512, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - next_batch(0, x_train, input_tensor, ffconfig, ffmodel) - 
next_batch_label(0, y_train, label_tensor, ffconfig, ffmodel) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - for epoch in range(0,epochs): - ffmodel.reset_metrics() - iterations = int(num_samples / ffconfig.batch_size) - print(iterations, num_samples) - ct = 0 - for iter in range(0, int(iterations)): - #ffconfig.begin_trace(111) - next_batch(ct, x_train, input_tensor, ffconfig, ffmodel) - next_batch_label(ct, y_train, label_tensor, ffconfig, ffmodel) - ct += 1 - ffmodel.forward() - ffmodel.zero_gradients() - ffmodel.backward() - ffmodel.update() - #ffconfig.end_trace(111) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < 0.3: - assert 0, 'Check Accuracy' - - conv_2d1 = ffmodel.get_layer_by_id(0) - cbias_tensor = conv_2d1.get_input_tensor() - #cbias_tensor = conv_2d1.get_output_tensor() - cbias_tensor.inline_map(ffmodel, ffconfig) - cbias = cbias_tensor.get_flat_array(ffmodel, ffconfig) - print(cbias.shape) - print(cbias) - cbias_tensor.inline_unmap(ffmodel, ffconfig) - - label_tensor.inline_map(ffmodel, ffconfig) - label_array = label_tensor.get_flat_array(ffmodel, ffconfig) - print(label_array.shape) - # print(cbias) - print(label_array) - label_tensor.inline_unmap(ffmodel, ffconfig) - - -if __name__ == "__main__": - print("cifar10 cnn attach") - top_level_task() diff --git a/examples/python/native/cifar10_cnn_concat.py b/examples/python/native/cifar10_cnn_concat.py deleted file mode 100644 index b177295ad6..0000000000 --- a/examples/python/native/cifar10_cnn_concat.py +++ /dev/null @@ -1,75 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 - -from accuracy import ModelAccuracy - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 32, 32] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - t1 = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t1 = ffmodel.conv2d(t1, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t2 = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t2 = ffmodel.conv2d(t2, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t3 = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t3 = ffmodel.conv2d(t3, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.concat([t1, t2, t3], 1) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0,) - t1 = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t2 = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.concat([t1, t2], 1) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0) - t = ffmodel.flat(t); - t = ffmodel.dense(t, 512, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - full_input_array = x_train - print(full_input_array.__array_interface__["strides"]) - - y_train = y_train.astype('int32') - full_label_array = y_train - - dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_array) - dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_array) - - num_samples = dataloader_input.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < ModelAccuracy.CIFAR10_CNN.value: - assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("cifar10 cnn concat") - top_level_task() diff --git a/examples/python/native/demo_gather.py b/examples/python/native/demo_gather.py deleted file mode 100644 index 2f17e85106..0000000000 --- a/examples/python/native/demo_gather.py +++ /dev/null @@ -1,35 +0,0 @@ -from flexflow.core import * -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - bs = ffconfig.batch_size - ffmodel = FFModel(ffconfig) - neighbors = [[[0], [5], [3], [3], [7], [9]]] - neighbors = np.array(neighbors).repeat(bs, 0).repeat(5, 2) - print(neighbors.shape) - x = np.array([[[0.01 for i in range(5)] for j in range(16)] for k in range(bs)], np.single) - print(x) - input = ffmodel.create_tensor([bs, 16, 5], DataType.DT_FLOAT) - index = ffmodel.create_tensor([bs, 6, 5], DataType.DT_INT32) - x0 = ffmodel.dense(input, 5, ActiMode.AC_MODE_NONE, False) - x1 = ffmodel.gather(x0, index, 1) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR]) - ffmodel.init_layers() - input.attach_numpy_array(ffmodel, ffconfig, x) - index.attach_numpy_array(ffmodel, ffconfig, neighbors) - label_tensor = ffmodel.label_tensor - y = np.random.rand(bs, 6, 5).astype('float32') - label_tensor.attach_numpy_array(ffmodel, ffconfig, y) - - for _ in range(100): - ffmodel.forward() - ffmodel.backward() - ffmodel.update() - -if __name__ == "__main__": - print("Demo Gather") - top_level_task() diff --git a/examples/python/native/dlrm.py b/examples/python/native/dlrm.py deleted file mode 100644 index 83be9f8322..0000000000 --- a/examples/python/native/dlrm.py +++ /dev/null @@ -1,17 +0,0 @@ -from flexflow.core import * - -def top_level_task(): - ffconfig = FFConfig() - dlrmconfig = DLRMConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - print(dlrmconfig.dataset_path, dlrmconfig.arch_interaction_op) - print(dlrmconfig.sparse_feature_size, dlrmconfig.sigmoid_bot, dlrmconfig.sigmoid_top, dlrmconfig.embedding_bag_size, dlrmconfig.loss_threshold) - print(dlrmconfig.mlp_bot) - print(dlrmconfig.mlp_top) - print(dlrmconfig.embedding_size) - ffmodel = FFModel(ffconfig) - - -if __name__ == "__main__": - print("dlrm") - top_level_task() diff 
--git a/examples/python/native/inception.py b/examples/python/native/inception.py deleted file mode 100644 index 1afbc65ced..0000000000 --- a/examples/python/native/inception.py +++ /dev/null @@ -1,160 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 - -from accuracy import ModelAccuracy -from PIL import Image -import numpy as np - -def InceptionA(ffmodel, input, pool_features): - t1 = ffmodel.conv2d(input, 64, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(input, 48, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(t2, 64, 5, 5, 1, 1, 2, 2) - t3 = ffmodel.conv2d(input, 64, 1, 1, 1, 1, 0, 0) - t3 = ffmodel.conv2d(t3, 96, 3, 3, 1, 1, 1, 1) - t3 = ffmodel.conv2d(t3, 96, 3, 3, 1, 1, 1, 1) - t4 = ffmodel.pool2d(input, 3, 3, 1, 1, 1, 1, PoolType.POOL_AVG) - t4 = ffmodel.conv2d(t4, pool_features, 1, 1, 1, 1, 0, 0) - output = ffmodel.concat([t1, t2, t3, t4], 1) - return output - -def InceptionB(ffmodel, input): - t1 = ffmodel.conv2d(input, 384, 3, 3, 2, 2, 0, 0) - t2 = ffmodel.conv2d(input, 64, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(t2, 96, 3, 3, 1, 1, 1, 1) - t2 = ffmodel.conv2d(t2, 96, 3, 3, 2, 2, 0, 0) - t3 = ffmodel.pool2d(input, 3, 3, 2, 2, 0, 0) - output = ffmodel.concat([t1, t2, t3], 1) - return output - -def InceptionC(ffmodel, input, channels): - t1 = ffmodel.conv2d(input, 192, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(input, channels, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(t2, channels, 1, 7, 1, 1, 0, 3) - t2 = ffmodel.conv2d(t2, 192, 7, 1, 1, 1, 3, 0) - t3 = ffmodel.conv2d(input, channels, 1, 1, 1, 1, 0, 0) - t3 = ffmodel.conv2d(t3, channels, 7, 1, 1, 1, 3, 0) - t3 = ffmodel.conv2d(t3, channels, 1, 7, 1, 1, 0, 3) - t3 = ffmodel.conv2d(t3, channels, 7, 1, 1, 1, 3, 0) - t3 = ffmodel.conv2d(t3, 192, 1, 7, 1, 1, 0, 3) - t4 = ffmodel.pool2d(input, 3, 3, 1, 1, 1, 1, PoolType.POOL_AVG) - t4 = ffmodel.conv2d(t4, 192, 1, 1, 1, 1, 0, 0) - output = ffmodel.concat([t1, t2, t3, t4], 1) - return output; - -def InceptionD(ffmodel, input): - t1 = ffmodel.conv2d(input, 192, 1, 1, 1, 1, 0, 0) - t1 = ffmodel.conv2d(t1, 320, 3, 3, 2, 2, 0, 0) - t2 = ffmodel.conv2d(input, 192, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(t2, 192, 1, 7, 1, 1, 0, 3) - t2 = ffmodel.conv2d(t2, 192, 7, 1, 1, 1, 3, 0) - t2 = ffmodel.conv2d(t2, 192, 3, 3, 2, 2, 0, 0) - t3 = ffmodel.pool2d(input, 3, 3, 2, 2, 0, 0) - output = ffmodel.concat([t1, t2, t3], 1) - return output; - -def InceptionE(ffmodel, input): - t1 = ffmodel.conv2d(input, 320, 1, 1, 1, 1, 0, 0) - t2i = ffmodel.conv2d(input, 384, 1, 1, 1, 1, 0, 0) - t2 = ffmodel.conv2d(t2i, 384, 1, 3, 1, 1, 0, 1) - t3 = ffmodel.conv2d(t2i, 384, 3, 1, 1, 1, 1, 0) - t3i = ffmodel.conv2d(input, 448, 1, 1, 1, 1, 0, 0) - t3i = ffmodel.conv2d(t3i, 384, 3, 3, 1, 1, 1, 1) - t4 = ffmodel.conv2d(t3i, 384, 1, 3, 1, 1, 0, 1) - t5 = ffmodel.conv2d(t3i, 384, 3, 1, 1, 1, 1, 0) - t6 = ffmodel.pool2d(input, 3, 3, 1, 1, 1, 1, PoolType.POOL_AVG) - t6 = ffmodel.conv2d(t6, 192, 1, 1, 1, 1, 0, 0) - output = ffmodel.concat([t1, t2, t3, t4, t5, t6], 1) - return output; - -def inception(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 299, 299] - #print(dims) - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - - t = ffmodel.conv2d(input, 32, 3, 3, 2, 2, 0, 0) - t = ffmodel.conv2d(t, 32, 3, 3, 1, 1, 0, 0) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1) - t = ffmodel.pool2d(t, 3, 3, 2, 2, 0, 0) - t = 
ffmodel.conv2d(t, 80, 1, 1, 1, 1, 0, 0) - t = ffmodel.conv2d(t, 192, 3, 3, 1, 1, 1, 1) - t = ffmodel.pool2d(t, 3, 3, 2, 2, 0, 0) - t = InceptionA(ffmodel, t, 32) - t = InceptionA(ffmodel, t, 64) - t = InceptionA(ffmodel, t, 64) - t = InceptionB(ffmodel, t) - t = InceptionC(ffmodel, t, 128) - t = InceptionC(ffmodel, t, 160) - t = InceptionC(ffmodel, t, 160) - t = InceptionC(ffmodel, t, 192) - t = InceptionD(ffmodel, t) - t = InceptionE(ffmodel, t) - t = InceptionE(ffmodel, t) - t = ffmodel.pool2d(t, 8, 8, 1, 1, 0, 0, PoolType.POOL_AVG) - t = ffmodel.flat(t) - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.001) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 299, 299), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((299,299), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - print(full_input_np.shape) - print(full_input_np.__array_interface__["strides"]) - print(full_input_np[0,:, :, :]) - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_np) - dataloader_label = ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, 8192 * epochs / run_time)); - - # conv_2d1 = ffmodel.get_layer_by_id(7) - # cbias_tensor = conv_2d1.get_weight_tensor() - # print(cbias_tensor) - # #cbias_tensor = conv_2d1.get_output_tensor() - # cbias_tensor.inline_map(ffconfig) - # cbias = cbias_tensor.get_array(ffconfig, DataType.DT_FLOAT) - # print(cbias.shape) - # #print(cbias) - # cbias_tensor.inline_unmap(ffconfig) - -if __name__ == "__main__": - print("inception") - inception() diff --git a/examples/python/native/mnist_cnn.py b/examples/python/native/mnist_cnn.py deleted file mode 100644 index 6eabbe57db..0000000000 --- a/examples/python/native/mnist_cnn.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
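# Aside (illustrative, not part of the original files): each Inception block above ends
# with a concat along the channel axis, so its output width is the sum of its branch
# widths. Checking InceptionA against the conv widths used in the deleted example:
def inception_a_out_channels(pool_features):
    # branches: 1x1 conv (64), 5x5 tower (64), double-3x3 tower (96), avg-pool + 1x1 (pool_features)
    return 64 + 64 + 96 + pool_features

assert inception_a_out_channels(32) == 256  # InceptionA(ffmodel, t, 32)
assert inception_a_out_channels(64) == 288  # the two InceptionA(ffmodel, t, 64) blocks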
-# - -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist - -from accuracy import ModelAccuracy -import argparse - - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" % ( - ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 1, 28, 28] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - num_samples = 60000 - - t = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, - 1, ActiMode.AC_MODE_RELU, True) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU, True) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0) - t = ffmodel.flat(t) - t = ffmodel.dense(t, 128, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[ - MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - img_rows, img_cols = 28, 28 - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - dataloader_input = ffmodel.create_data_loader(input_tensor, x_train) - dataloader_label = ffmodel.create_data_loader(label_tensor, y_train) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start) - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" % - (epochs, run_time, num_samples * epochs / run_time)) - - perf_metrics = ffmodel.get_perf_metrics() - - return perf_metrics - - -def test_accuracy(): - perf_metrics = top_level_task() - accuracy = perf_metrics.get_accuracy() - try: - assert (accuracy >= ModelAccuracy.MNIST_CNN.value), "Accuracy less than 90%" - except AssertionError as e: - raise - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-a", "--test_acc", - action="store_true", help="Test accuracy flag") - args, unknown = parser.parse_known_args() - if args.test_acc: - print("Testing mnist cnn training accuracy") - test_accuracy() - else: - print("mnist cnn") - top_level_task() diff --git a/examples/python/native/mnist_mlp.py b/examples/python/native/mnist_mlp.py deleted file mode 100644 index aefe7cfd57..0000000000 --- a/examples/python/native/mnist_mlp.py +++ /dev/null @@ -1,84 +0,0 @@ -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist - -from accuracy import ModelAccuracy -import argparse - - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" % ( - ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 784] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - num_samples = 60000 - - kernel_init = UniformInitializer(12, -1, 1) - t = ffmodel.dense(input_tensor, 512, ActiMode.AC_MODE_RELU, - kernel_initializer=kernel_init) - t = ffmodel.dense(t, 512, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - - t = 
ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[ - MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - print(x_train.shape) - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - dataloader_input = ffmodel.create_data_loader(input_tensor, x_train) - dataloader_label = ffmodel.create_data_loader(label_tensor, y_train) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - ffmodel.eval(x=dataloader_input, y=dataloader_label) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start) - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" % - (epochs, run_time, num_samples * epochs / run_time)) - - perf_metrics = ffmodel.get_perf_metrics() - - return perf_metrics - - -def test_accuracy(): - perf_metrics = top_level_task() - accuracy = perf_metrics.get_accuracy() - try: - assert (accuracy >= ModelAccuracy.MNIST_MLP.value), "Accuracy less than 90%" - except AssertionError as e: - raise - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-a", "--test_acc", - action="store_true", help="Test accuracy flag") - args, unknown = parser.parse_known_args() - if args.test_acc: - print("Testing mnist mlp training accuracy") - test_accuracy() - else: - print("mnist mlp") - top_level_task() diff --git a/examples/python/native/mnist_mlp_attach.py b/examples/python/native/mnist_mlp_attach.py deleted file mode 100644 index 6e7c8f8405..0000000000 --- a/examples/python/native/mnist_mlp_attach.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
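# Aside (illustrative): the ELAPSED TIME / THROUGHPUT printout used throughout these
# examples appears to assume get_current_time() returns microseconds, hence the 1e-6
# factor. A worked instance with hypothetical timestamps:
epochs, num_samples = 1, 60000
ts_start, ts_end = 0.0, 30_000_000.0           # 30 s expressed in microseconds
run_time = 1e-6 * (ts_end - ts_start)          # 30.0 seconds
throughput = num_samples * epochs / run_time   # 2000.0 samples/s
assert (run_time, throughput) == (30.0, 2000.0)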
-# - -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist -from accuracy import ModelAccuracy - -def next_batch(idx, x_train, input1, ffconfig, ffmodel): - start = idx*ffconfig.batch_size - x_train_batch = x_train[start:start+ffconfig.batch_size, :] - - # input1.inline_map(ffconfig) - # input_array = input1.get_array(ffconfig, DataType.DT_FLOAT) - # print(input_array.shape) - # for i in range(0, ffconfig.batch_size): - # for j in range(0, 784): - # input_array[i][j] = x_train_batch[i][j] - # input1.inline_unmap(ffconfig) - #TODO: test set tensor - input1.set_tensor(ffmodel, x_train_batch) - -def next_batch_label(idx, x_train, input1, ffconfig, ffmodel): - start = idx*ffconfig.batch_size - x_train_batch = x_train[start:start+ffconfig.batch_size, :] - - # input1.inline_map(ffconfig) - # input_array = input1.get_array(ffconfig, DataType.DT_INT32) - # print(input_array.shape) - # for i in range(0, ffconfig.batch_size): - # for j in range(0, 1): - # input_array[i][j] = x_train_batch[i][j] - # input1.inline_unmap(ffconfig) - # - input1.set_tensor(ffmodel, x_train_batch) - # x_batch = input1.get_tensor(ffmodel, CommType.PS) - # print(x_batch) - # print(x_train_batch) - # assert 0 - - -def top_level_task(): - alexnetconfig = NetConfig() - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 784] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT); - - num_samples = 60000 - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - print(x_train.shape) - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - print(x_train.shape[0], 'train samples') - print(y_train.shape) - - t2 = ffmodel.dense(input_tensor, 512, ActiMode.AC_MODE_RELU) - t3 = ffmodel.dense(t2, 512, ActiMode.AC_MODE_RELU) - t4 = ffmodel.dense(t3, 10) - t5 = ffmodel.softmax(t4) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - next_batch(0, x_train, input_tensor, ffconfig, ffmodel) - next_batch_label(0, y_train, label_tensor, ffconfig, ffmodel) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - for epoch in range(0,epochs): - ct = 0 - ffmodel.reset_metrics() - iterations = num_samples / ffconfig.batch_size - for iter in range(0, int(iterations)): - #ffconfig.begin_trace(111) - next_batch(ct, x_train, input_tensor, ffconfig, ffmodel) - next_batch_label(ct, y_train, label_tensor, ffconfig, ffmodel) - ct += 1 - ffmodel.forward() - ffmodel.zero_gradients() - ffmodel.backward() - ffmodel.update() - #ffconfig.end_trace(111) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < 65: - assert 0, 'Check Accuracy' - - dense1 = ffmodel.get_layer_by_id(0) - - dbias_tensor = label_tensor#dense1.get_bias_tensor() - dbias_tensor.inline_map(ffmodel, 
ffconfig) - dbias = dbias_tensor.get_array(ffmodel, ffconfig) - print(dbias.shape) - print(dbias) - dbias_tensor.inline_unmap(ffmodel, ffconfig) - - # dweight_tensor = dense1.get_output_tensor() - # dweight_tensor.inline_map(ffconfig) - # dweight = dweight_tensor.get_array(ffconfig, DataType.DT_FLOAT) - # print(dweight.shape) - # print(dweight) - # dweight_tensor.inline_unmap(ffconfig) - - -if __name__ == "__main__": - print("mnist mlp attach") - top_level_task() diff --git a/examples/python/native/multi_head_attention.py b/examples/python/native/multi_head_attention.py deleted file mode 100644 index 7ccdae4186..0000000000 --- a/examples/python/native/multi_head_attention.py +++ /dev/null @@ -1,74 +0,0 @@ -from flexflow.core import * -from argparse import ArgumentParser -import numpy as np - -def parse_args(): - parser = ArgumentParser() - parser.add_argument('--seq-length', default=256, type=int) - parser.add_argument('--num-heads', default=16, type=int) - parser.add_argument('--hidden-size', default=512, type=int) - args, unknown = parser.parse_known_args() - return args - -def attention(): - args = parse_args() - ffconfig = FFConfig() - print("Python API: batch_size(%d) GPUs/node(%d) nodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - batch_size = ffconfig.batch_size - dims_input = [batch_size, args.seq_length, args.hidden_size] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - q = ffmodel.dense(input, args.hidden_size) - k = ffmodel.dense(input, args.hidden_size) - v = ffmodel.dense(input, args.hidden_size) - - q = ffmodel.reshape(q, shape=(batch_size, args.seq_length, args.num_heads, args.hidden_size // args.num_heads)) - k = ffmodel.reshape(k, shape=(batch_size, args.seq_length, args.num_heads, args.hidden_size // args.num_heads)) - v = ffmodel.reshape(v, shape=(batch_size, args.seq_length, args.num_heads, args.hidden_size // args.num_heads)) - q = ffmodel.transpose(q, perm=(0, 2, 1, 3)) - k = ffmodel.transpose(k, perm=(0, 2, 3, 1)) - v = ffmodel.transpose(v, perm=(0, 2, 1, 3)) - logits = ffmodel.batch_matmul(q, k) - #logits = ffmodel.softmax(logits) - output = ffmodel.batch_matmul(logits, v) - output = ffmodel.transpose(output, perm=(0, 2, 1, 3)) - output = ffmodel.reshape(output, shape=(batch_size, args.seq_length, args.hidden_size)) - output = ffmodel.dense(output, args.hidden_size, ActiMode.AC_MODE_RELU) - output = ffmodel.dense(output, args.hidden_size) - ffoptimizer = SGDOptimizer(ffmodel) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], comp_mode=CompMode.INFERENCE) - label_tensor = ffmodel.label_tensor - - # Full inputs/label - dims = [batch_size * 10, args.seq_length, args.hidden_size] - num_samples = batch_size * 10 - np_input = np.zeros(dims, dtype=np.float32) - np_label = np.zeros(dims, dtype=np.float32) - - dl_input = ffmodel.create_data_loader(input, np_input) - dl_label = ffmodel.create_data_loader(label_tensor, np_label) - - ffmodel.init_layers() - epochs = ffconfig.epochs - - dl_input.next_batch(ffmodel) - dl_label.next_batch(ffmodel) - - ts_start = ffconfig.get_current_time() - for epoch in range(0, epochs): - ffmodel.reset_metrics() - iterations = num_samples // batch_size - for iter in range(0, iterations): - ffconfig.begin_trace(111) - ffmodel.forward() - ffmodel.zero_gradients() - ffmodel.backward() - ffmodel.update() - ffconfig.end_trace(111) - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end -
ts_start) - print("EPOCHS %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - -if __name__ == "__main__": - print("Attention") - attention() diff --git a/examples/python/native/print_input.py b/examples/python/native/print_input.py deleted file mode 100644 index 84d1c2a7ce..0000000000 --- a/examples/python/native/print_input.py +++ /dev/null @@ -1,90 +0,0 @@ -from flexflow.core import * - -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims1 = [ffconfig.batch_size, 3, 229, 229] - input1 = ffmodel.create_tensor(dims1, DataType.DT_FLOAT); - - dims2 = [ffconfig.batch_size, 256] - input2 = ffmodel.create_tensor(dims2, DataType.DT_FLOAT); - - dims_label = [ffconfig.batch_size, 1] - label = ffmodel.create_tensor(dims_label, DataType.DT_INT32); - - # alexnetconfig = NetConfig() - # dataloader = DataLoader4D(ffmodel, input1, label, ffnetconfig=alexnetconfig) - # dataloader.reset() - # dataloader.next_batch(ffmodel) - - input1.inline_map(ffconfig) - input1_array = input1.get_array(ffconfig) - rawptr = input1_array.__array_interface__['data'] - print(hex(rawptr[0])) - # input1_array *= 0 - # input1_array += 1.1 - print(input1_array.shape) - print(input1_array) - input1.inline_unmap(ffconfig) - - input2.inline_map(ffconfig) - input2_array = input2.get_array(ffconfig) - rawptr = input2_array.__array_interface__['data'] - print(hex(rawptr[0])) - input2_array *= 0 - input2_array += 2.2 - print(input2_array.shape) - print(input2_array) - rawptr = input2_array.__array_interface__['data'] - print(hex(rawptr[0])) - input2.inline_unmap(ffconfig) - - input1 = ffmodel.conv2d(input1, 64, 11, 11, 4, 4, 2, 2) - input2 = ffmodel.dense(input2, 128, ActiMode.AC_MODE_RELU) - input2 = ffmodel.dense(input2, 128, ActiMode.AC_MODE_RELU) - # - # - # ffmodel.init_layers() - # - # conv1 = ffmodel.get_layer_by_id(0) - # input_tensor1 = conv1.get_input_tensor() - # input_tensor1.inline_map(ffconfig) - # input_array11 = input_tensor1.get_array(ffconfig) - # print(input_array11.shape) - # #print(input_array11) - # input_tensor1.inline_unmap(ffconfig) - # - # output_tensor1 = conv1.get_output_tensor() - # output_tensor1.inline_map(ffconfig) - # output_array11 = output_tensor1.get_array(ffconfig) - # print(output_array11.shape) - # #print(output_array11) - # output_tensor1.inline_unmap(ffconfig) - - - dense1 = ffmodel.get_layer_by_id(1) - if flexflow_python_binding() == "cffi": - input_tensor2 = dense1.get_input_tensor() - else: - input_tensor2 = dense1.get_input_tensor_by_id(0) - input_tensor2.inline_map(ffconfig) - input_array22 = input_tensor2.get_array(ffconfig) - print(input_array22.shape) - print(input_array22) - input_tensor2.inline_unmap(ffconfig) - - # output_tensor2 = dense1.get_output_tensor() - # output_tensor2.inline_map(ffconfig) - # output_array22 = output_tensor2.get_array(ffconfig) - # print(output_array22.shape) - # #print(output_array11) - # output_tensor2.inline_unmap(ffconfig) - - -if __name__ == "__main__": - print("alexnet") - top_level_task() diff --git a/examples/python/native/print_layers.py b/examples/python/native/print_layers.py deleted file mode 100644 index 22b87e0b86..0000000000 --- a/examples/python/native/print_layers.py +++ /dev/null @@ -1,124 +0,0 @@ -from flexflow.core import * - -import numpy as np - -def top_level_task(): - ffconfig = 
FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims1 = [ffconfig.batch_size, 3, 229, 229] - input1 = ffmodel.create_tensor(dims1, DataType.DT_FLOAT); - - dims2 = [ffconfig.batch_size, 16] - input2 = ffmodel.create_tensor(dims2, DataType.DT_FLOAT); - - dims_label = [ffconfig.batch_size, 1] - label = ffmodel.create_tensor(dims_label, DataType.DT_INT32); - - t1 = ffmodel.conv2d(input1, 64, 11, 11, 4, 4, 2, 2) - t2 = ffmodel.dense(input2, 8, ActiMode.AC_MODE_RELU) - #t3 = ffmodel.dense("dense1", t2, 128, ActiMode.AC_MODE_RELU) - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.compile(optimizer=ffoptimizer, loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - # Data Loader - # alexnetconfig = NetConfig() - # dataloader = DataLoader4D(ffmodel, input1, label, ffnetconfig=alexnetconfig) - - # ffmodel.init_layers() - - label.inline_map(ffmodel, ffconfig) - label_array = label.get_array(ffmodel, ffconfig) - label_array *= 0 - label_array += 1 - print(label_array.shape) - print(label_array) - label.inline_unmap(ffmodel, ffconfig) - - #weight of conv2d - # t3 = ffmodel.get_tensor_by_id(0) - # - # np_array = np.zeros((64, 3, 11, 11), dtype=np.float32) - # np_array += 1.2 - # t3.set_weights(ffmodel, np_array) - # - # t3.inline_map(ffconfig) - # t3_array = t3.get_array(ffconfig) - # print(t3_array.shape) - # print(t3_array) - # t3.inline_unmap(ffconfig) - - - ###3 - conv_2d1 = ffmodel.get_layer_by_id(0) - - cbias_tensor = conv_2d1.get_bias_tensor() - - np_array = np.zeros((64), dtype=np.float32) - np_array += 22.222 - cbias_tensor.set_weights(ffmodel, np_array) - - # cbias_tensor.inline_map(ffconfig) - # cbias = cbias_tensor.get_array(ffconfig) - # print(cbias) - # cbias *= 0.0 - # cbias += 1.1 - # print(cbias.shape) - # print(cbias) - # cbias_tensor.inline_unmap(ffconfig) - # - # cweight_tensor = conv_2d1.get_weight_tensor() - # cweight_tensor.inline_map(ffconfig) - # cweight = cweight_tensor.get_array(ffconfig) - # #cweight += 1.2 - # ct = 0.0 - # for i in range(cweight.shape[0]): - # for j in range(cweight.shape[1]): - # for k in range(cweight.shape[2]): - # for l in range(cweight.shape[3]): - # cweight[i][j][k][l] += ct - # ct += 1.0 - # print(cweight.shape) - # # print(cweight.strides) - # print(cweight) - # cweight_tensor.inline_unmap(ffconfig) - # - # np_array = cweight_tensor.get_weights(ffmodel) - # print(np_array) - - dense1 = ffmodel.get_layer_by_id(1) - - dbias_tensor = dense1.get_bias_tensor() - dbias_tensor.inline_map(ffmodel, ffconfig) - dbias = dbias_tensor.get_array(ffmodel, ffconfig) - dbias *= 0.0 - dbias += 2.1 - print(dbias.shape) - print(dbias) - dbias_tensor.inline_unmap(ffmodel, ffconfig) - - np_array = dbias_tensor.get_weights(ffmodel) - print(np_array) - - dweight_tensor = dense1.get_weight_tensor() - dweight_tensor.inline_map(ffmodel, ffconfig) - dweight = dweight_tensor.get_array(ffmodel, ffconfig) - #dweight *= 0.0 - #dweight += 2.2 - ct = 0.0 - # for i in range(dweight.shape[0]): - # for j in range(dweight.shape[1]): - # dweight[i][j] = ct - # ct += 1.0 - # print(dweight.shape) - # print(dweight.strides) - # print(dweight) - dweight_tensor.inline_unmap(ffmodel, ffconfig) - - # ffmodel.print_layers(0) - -if __name__ == "__main__": - print("alexnet") - top_level_task() diff --git 
a/examples/python/native/print_weight.py b/examples/python/native/print_weight.py deleted file mode 100644 index 5c5e4bc556..0000000000 --- a/examples/python/native/print_weight.py +++ /dev/null @@ -1,54 +0,0 @@ -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist - -from accuracy import ModelAccuracy -import argparse - - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" % ( - ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 784] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - num_samples = 60000 - - kernel_init = UniformInitializer(12, -1, 1) - t = ffmodel.dense(input_tensor, 512, ActiMode.AC_MODE_RELU, - kernel_initializer=kernel_init) - t = ffmodel.dense(t, 512, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[ - MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - print(x_train.shape) - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - dataloader_input = ffmodel.create_data_loader(input_tensor, x_train) - dataloader_label = ffmodel.create_data_loader(label_tensor, y_train) - - ffmodel.init_layers() - - dense1 = ffmodel.get_layer_by_id(0) - print(dense1) - print(dense1.get_weight_tensor()) - -if __name__ == "__main__": - print("mnist mlp test weight") - top_level_task() \ No newline at end of file diff --git a/examples/python/native/resnet.py b/examples/python/native/resnet.py deleted file mode 100644 index f30fdf41ba..0000000000 --- a/examples/python/native/resnet.py +++ /dev/null @@ -1,109 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 - -from PIL import Image -import numpy as np - -def BottleneckBlock(ff, input, out_channels, stride): - t = ff.conv2d(input, out_channels, 1, 1, 1, 1, 0, 0, ActiMode.AC_MODE_NONE) - t = ff.batch_norm(t) - t = ff.conv2d(t, out_channels, 3, 3, stride, stride, 1, 1, ActiMode.AC_MODE_NONE) - t = ff.batch_norm(t) - t = ff.conv2d(t, 4*out_channels, 1, 1, 1, 1, 0, 0) - t = ff.batch_norm(t, False) - if ((stride > 1) or (input.dims[1] != out_channels * 4)): - print("input.adim = %d out_channels*4 = %d" %(input.dims[1], out_channels*4)) - input = ff.conv2d(input, 4*out_channels, 1, 1, stride, stride, 0, 0, ActiMode.AC_MODE_NONE) - input = ff.batch_norm(input, False); - t = ff.add(input, t) - return ff.relu(t) - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 229, 229] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - t = ffmodel.conv2d(input, 64, 7, 7, 2, 2, 3, 3) - t = ffmodel.batch_norm(t); - t = ffmodel.pool2d(t, 3, 3, 2, 2, 1, 1) - for i in range(0, 3): - t = BottleneckBlock(ffmodel, t, 64, 1) - for i in range(0, 4): - if (i == 0): - stride = 2 - else: - stride = 1 - t = 
BottleneckBlock(ffmodel, t, 128, stride) - for i in range(0, 6): - if (i == 0): - stride = 2 - else: - stride = 1 - t = BottleneckBlock(ffmodel, t, 256, stride) - for i in range(0, 3): - if (i == 0): - stride = 2 - else: - stride = 1 - t = BottleneckBlock(ffmodel, t, 512, stride); - t = ffmodel.pool2d(t, 7, 7, 1, 1, 0, 0, PoolType.POOL_AVG) - t = ffmodel.flat(t); - t = ffmodel.dense(t, 10) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.001) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - # load data - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - - full_input_np /= 255 - print(full_input_np.shape) - print(full_input_np.__array_interface__["strides"]) - print(full_input_np[0,:, :, :]) - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_np) - dataloader_label = ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - -if __name__ == "__main__": - print("resnet") - top_level_task() diff --git a/examples/python/native/split.py b/examples/python/native/split.py deleted file mode 100644 index dfd8b0e572..0000000000 --- a/examples/python/native/split.py +++ /dev/null @@ -1,82 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 - -from accuracy import ModelAccuracy - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 32, 32] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - t1 = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t2 = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t3 = ffmodel.conv2d(input_tensor, 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.concat([t1, t2, t3], 1) - ts = ffmodel.split(t, 3, 1) - t = ffmodel.conv2d(ts[1], 32, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0,) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.conv2d(t, 64, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU) - t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0) - t = ffmodel.flat(t); - t = ffmodel.dense(t, 512, ActiMode.AC_MODE_RELU) - t = ffmodel.dense(t, 10) - t = 
ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - full_input_array = x_train - print(full_input_array.__array_interface__["strides"]) - - y_train = y_train.astype('int32') - full_label_array = y_train - - print(full_input_array.__array_interface__["strides"]) - print(full_input_array.shape, full_label_array.shape) - #print(full_input_array[0,:,:,:]) - #print(full_label_array[0, 0:64]) - print(full_label_array.__array_interface__["strides"]) - - dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_array) - dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_array) - - num_samples = dataloader_input.num_samples - - ffmodel.init_layers() - - if flexflow_python_binding() == 'cffi': - print("end init model", ts[0].handle.impl) - - epochs = ffconfig.epochs - #epochs = 10 - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.CIFAR10_CNN.value: - # assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("cifar10 cnn split") - top_level_task() diff --git a/examples/python/native/tensor_attach.py b/examples/python/native/tensor_attach.py deleted file mode 100644 index 93bbb34379..0000000000 --- a/examples/python/native/tensor_attach.py +++ /dev/null @@ -1,34 +0,0 @@ -from flexflow.core import * -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [8, 3, 10, 10] - #print(dims) - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - input_np = np.zeros((10,10,3,8), dtype=np.float32) - ct = 0.0 - for i in range(0, input_np.shape[0]): - for j in range(0, input_np.shape[1]): - for k in range(0, input_np.shape[2]): - for l in range(0, input_np.shape[3]): - input_np[i, j, k, l] = ct - ct += 1 - print(input_np) - - input.attach_numpy_array(ffconfig, input_np) - print(input.is_mapped()) - input_array = input.get_array(ffconfig, DataType.DT_FLOAT) - print(input_array) - input.detach_numpy_array(ffconfig) - - -if __name__ == "__main__": - print("tensor attach") - top_level_task() diff --git a/examples/python/onnx/accuracy.py b/examples/python/onnx/accuracy.py deleted file mode 100644 index 30b15402f4..0000000000 --- a/examples/python/onnx/accuracy.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
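# Aside (illustrative, not part of the original files): the split.py example above
# concatenates three 32-channel tensors and then splits the result back into three
# equal chunks along axis 1. The same shape arithmetic in plain NumPy:
import numpy as np
t1 = t2 = t3 = np.zeros((64, 32, 32, 32), dtype=np.float32)
t = np.concatenate([t1, t2, t3], axis=1)   # (64, 96, 32, 32)
ts = np.split(t, 3, axis=1)                # three (64, 32, 32, 32) chunks
assert ts[1].shape == (64, 32, 32, 32)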
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from enum import Enum - -class ModelAccuracy(Enum): - MNIST_MLP = 90 - MNIST_CNN = 90 - REUTERS_MLP = 90 - CIFAR10_CNN = 90 - CIFAR10_ALEXNET = 90 diff --git a/examples/python/onnx/alexnet.py b/examples/python/onnx/alexnet.py deleted file mode 100644 index 172dee7dc0..0000000000 --- a/examples/python/onnx/alexnet.py +++ /dev/null @@ -1,71 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.onnx.model import ONNXModel - -from accuracy import ModelAccuracy -from PIL import Image -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 229, 229] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - onnx_model = ONNXModel("alexnet.onnx") - t = onnx_model.apply(ffmodel, {"input.1": input}) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_np) - dataloader_label = ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < ModelAccuracy.CIFAR10_ALEXNET.value: - assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("alexnet onnx") - top_level_task() diff --git a/examples/python/onnx/alexnet_pt.py b/examples/python/onnx/alexnet_pt.py deleted file mode 100644 index e08562f751..0000000000 --- a/examples/python/onnx/alexnet_pt.py +++ /dev/null @@ -1,41 +0,0 @@ -import onnx -import torch -import torch.nn as nn -from torch.onnx import TrainingMode - -class AlexNet(nn.Module): - def __init__(self, 
num_classes: int = 1000) -> None: - super(AlexNet, self).__init__() - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(64, 192, kernel_size=5, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(192, 384, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(384, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - ) - self.classifier = nn.Sequential( - nn.Linear(256 * 6 * 6, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, num_classes), - nn.Softmax(), - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.features(x) - x = torch.flatten(x, 1) - x = self.classifier(x) - return x - -input = torch.randn(64, 3, 224, 224) -model = AlexNet(num_classes=10) -torch.onnx.export(model, (input), "alexnet.onnx", export_params=False, training=TrainingMode.TRAINING) diff --git a/examples/python/onnx/cifar10_cnn.py b/examples/python/onnx/cifar10_cnn.py deleted file mode 100644 index f4272cdd5e..0000000000 --- a/examples/python/onnx/cifar10_cnn.py +++ /dev/null @@ -1,70 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.onnx.model import ONNXModel, ONNXModelKeras -import argparse - -from accuracy import ModelAccuracy - -def top_level_task(test_type=1): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 32, 32] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - if test_type == 1: - onnx_model = ONNXModel("cifar10_cnn_pt.onnx") - t = onnx_model.apply(ffmodel, {"input.1": input}) - else: - onnx_model = ONNXModelKeras("cifar10_cnn_keras.onnx", ffconfig, ffmodel) - t = onnx_model.apply(ffmodel, {"input_1": input}) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - full_input_array = x_train - print(full_input_array.__array_interface__["strides"]) - - y_train = y_train.astype('int32') - full_label_array = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_array) - dataloader_label = ffmodel.create_data_loader(label, full_label_array) - - num_samples = dataloader_input.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < ModelAccuracy.CIFAR10_CNN.value: - assert 0, 'Check Accuracy' - -if __name__ == "__main__": - 
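# Aside (illustrative): PyTorch's exporter names the graph input 'input.1' while
# keras2onnx names it 'input_1', which is why the two branches above bind different
# keys in onnx_model.apply(). The actual names can be listed with the onnx package,
# assuming the exported .onnx file exists:
import onnx
m = onnx.load("cifar10_cnn_pt.onnx")
print([i.name for i in m.graph.input])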
print("cifar10 cnn onnx") - parser = argparse.ArgumentParser() - parser.add_argument('--test_type', type=int, choices=[0, 1], help="Whether to test using Keras (test_type 0) or PyTorch (test_type 1) ") - args, unknown = parser.parse_known_args() - test_type = args.test_type - top_level_task(test_type) diff --git a/examples/python/onnx/cifar10_cnn_keras.py b/examples/python/onnx/cifar10_cnn_keras.py deleted file mode 100644 index d53b73a09e..0000000000 --- a/examples/python/onnx/cifar10_cnn_keras.py +++ /dev/null @@ -1,43 +0,0 @@ -from tensorflow.keras.models import Model, Sequential -from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Input -import keras2onnx -import onnx - -from keras import backend -backend.set_image_data_format('channels_first') - -num_classes = 10 - -input_tensor1 = Input(shape=(3, 32, 32)) - -output_tensor = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(input_tensor1) -output_tensor = Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) -output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) -output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) -output_tensor = Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")(output_tensor) -output_tensor = MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")(output_tensor) -output_tensor = Flatten()(output_tensor) -output_tensor = Dense(512, activation="relu")(output_tensor) -output_tensor = Dense(num_classes)(output_tensor) -output_tensor = Activation("softmax")(output_tensor) - -model = Model(input_tensor1, output_tensor) - -print(model.summary()) -print(model.get_layer(index=1).output.name) -print(model.get_layer(index=1).input.name) - -onnx_model = keras2onnx.convert_keras(model, "mlp") -onnx.save(onnx_model, "cifar10_cnn_keras.onnx") - -for node in onnx_model.graph.node: - print(node) -# -# for input in onnx_model.graph.initializer: -# print(input.name, input.dims, len(input.dims)) -# -# for input in onnx_model.graph.input: -# print(input) -# -# for output in onnx_model.graph.output: -# print(output, type(output)) \ No newline at end of file diff --git a/examples/python/onnx/cifar10_cnn_pt.py b/examples/python/onnx/cifar10_cnn_pt.py deleted file mode 100644 index 6707408259..0000000000 --- a/examples/python/onnx/cifar10_cnn_pt.py +++ /dev/null @@ -1,45 +0,0 @@ -import onnx -import torch -import torch.nn as nn -from torch.onnx import TrainingMode - -class CNN(nn.Module): - def __init__(self): - super().__init__() - self.features = nn.Sequential( - nn.Conv2d(3, 32, 3, 1, 1), - nn.ReLU(), - nn.Conv2d(32, 32, 3, 1, 1), - nn.ReLU(), - nn.MaxPool2d(2, 2), - nn.Conv2d(32, 64, 3, 1, 1), - nn.ReLU(), - nn.Conv2d(64, 64, 3, 1, 1), - nn.ReLU(), - nn.MaxPool2d(2, 2)) - self.classifier = nn.Sequential( - nn.Linear(4096, 512), - nn.ReLU(), - nn.Linear(512, 10), - nn.ReLU(), - nn.Softmax()) - - def forward(self, x): - x = self.features(x) - print(x.shape) - x = torch.flatten(x, 1) - print(x.shape) - x = self.classifier(x) - return x - -input = torch.randn(64, 3, 32, 32) -model = CNN() -torch.onnx.export(model, (input), "cifar10_cnn_pt.onnx", export_params=False, training=TrainingMode.TRAINING) - -onnx_model = onnx.load("cifar10_cnn_pt.onnx") - -for node in onnx_model.graph.node: - print(node) -# -# for input in onnx_model.graph.input: -# print(input) diff --git 
a/examples/python/onnx/mnist_mlp.py b/examples/python/onnx/mnist_mlp.py deleted file mode 100644 index 05c5a74278..0000000000 --- a/examples/python/onnx/mnist_mlp.py +++ /dev/null @@ -1,65 +0,0 @@ -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist -from flexflow.onnx.model import ONNXModel, ONNXModelKeras -import argparse - -from accuracy import ModelAccuracy - -def top_level_task(test_type=1): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims1 = [ffconfig.batch_size, 784] - input1 = ffmodel.create_tensor(dims1, DataType.DT_FLOAT); - - num_samples = 60000 - - if test_type == 1: - onnx_model = ONNXModel("mnist_mlp_pt.onnx") - t = onnx_model.apply(ffmodel, {"input.1": input1}) - else: - onnx_model = ONNXModelKeras("mnist_mlp_keras.onnx", ffconfig, ffmodel) - t = onnx_model.apply(ffmodel, {"input_1": input1}) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - dataloader_input = ffmodel.create_data_loader(input1, x_train) - dataloader_label = ffmodel.create_data_loader(label, y_train) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < ModelAccuracy.MNIST_MLP.value: - assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("mnist mlp onnx") - parser = argparse.ArgumentParser() - parser.add_argument('--test_type', type=int, choices=[0, 1], help="Whether to test using Keras (test_type 0) or PyTorch (test_type 1) ") - args, unknown = parser.parse_known_args() - test_type = args.test_type - top_level_task(test_type) diff --git a/examples/python/onnx/mnist_mlp_keras.py b/examples/python/onnx/mnist_mlp_keras.py deleted file mode 100644 index 8698696440..0000000000 --- a/examples/python/onnx/mnist_mlp_keras.py +++ /dev/null @@ -1,50 +0,0 @@ -from tensorflow.keras.models import Model, Sequential -from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Input -import keras2onnx -import onnx - -from keras import backend -backend.set_image_data_format('channels_first') - -num_classes = 10 - -input_tensor = Input(shape=(784)) -output = Dense(512, activation="relu")(input_tensor) -output = Dense(512, activation="relu")(output) -output = Dense(num_classes)(output) -output = Activation("softmax")(output) -model = Model(inputs=input_tensor, outputs=output) - -# model = Sequential() -# model.add(Dense(512, input_shape=(64,784))) -# model.add(Activation('relu')) -# model.add(Dense(512, activation="relu")) -# model.add(Dense(num_classes)) -# model.add(Activation("softmax")) - -# layers = 
[Input(shape=(28, 28, 1,)), -# Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"), -# Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"), -# MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"), -# Flatten(), -# Dense(128, activation="relu"), -# Dense(num_classes), -# Activation("softmax")] -# model = Sequential(layers) - -onnx_model = keras2onnx.convert_keras(model, "mlp") -onnx.save(onnx_model, "mnist_mlp_keras.onnx") - -for node in onnx_model.graph.node: - print(node) -# -# for input in onnx_model.graph.initializer: -# print(input.name, input.dims, len(input.dims)) -# if '/bias' in input.name: -# print(input.name, type(input)) -# -# for input in onnx_model.graph.input: -# print(input) -# -# for output in onnx_model.graph.output: -# print(output, type(output)) \ No newline at end of file diff --git a/examples/python/onnx/mnist_mlp_pt.py b/examples/python/onnx/mnist_mlp_pt.py deleted file mode 100644 index b1dd673eb1..0000000000 --- a/examples/python/onnx/mnist_mlp_pt.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -import torch.nn as nn -import onnx - -class MLP(nn.Module): - def __init__(self): - super().__init__() - self.linear1 = nn.Linear(784, 512) - self.linear2 = nn.Linear(512, 512) - self.linear3 = nn.Linear(512, 10) - self.relu = nn.ReLU() - self.softmax = nn.Softmax() - - def forward(self, x): - y = self.linear1(x) - y = self.relu(y) - y = self.linear2(y) - y = self.relu(y) - y = self.linear3(y) - y = self.softmax(y) - return y - -input = torch.randn(100, 784) -model = MLP() - -torch.onnx.export(model, (input), "mnist_mlp_pt.onnx", export_params=False) - -onnx_model = onnx.load("mnist_mlp_pt.onnx") -# for input in onnx_model.graph.input: -# print(input) -for node in onnx_model.graph.node: - print(node) diff --git a/examples/python/onnx/resnet.py b/examples/python/onnx/resnet.py deleted file mode 100644 index df4d6c7b6d..0000000000 --- a/examples/python/onnx/resnet.py +++ /dev/null @@ -1,72 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.onnx.model import ONNXModel - -from accuracy import ModelAccuracy -from PIL import Image -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 229, 229] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - onnx_model = ONNXModel("resnet18.onnx") - t = onnx_model.apply(ffmodel, {"input.1": input}) - t = ffmodel.softmax(t) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - - y_train = 
y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_np) - dataloader_label = ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - perf_metrics = ffmodel.get_perf_metrics() - accuracy = perf_metrics.get_accuracy() - if accuracy < ModelAccuracy.CIFAR10_ALEXNET.value: - assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("resnet onnx") - top_level_task() diff --git a/examples/python/onnx/resnet_pt.py b/examples/python/onnx/resnet_pt.py deleted file mode 100644 index a1e5fcb650..0000000000 --- a/examples/python/onnx/resnet_pt.py +++ /dev/null @@ -1,380 +0,0 @@ -import torch -from torch import Tensor -import torch.nn as nn -from typing import Type, Any, Callable, Union, List, Optional -from torch.onnx import TrainingMode -import onnx - -def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: - """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=False, dilation=dilation) - - -def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -class BasicBlock(nn.Module): - expansion: int = 1 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError('BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. - # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
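The deleted `resnet.py` above resizes each CIFAR-10 image to 229x229 in an inline loop (note 229, not the 224 used by the PyTorch export script later in this diff). If the example were kept, the loop reads more clearly factored into a helper; this is a direct refactor of the code above, not new behavior:

```python
import numpy as np
from PIL import Image

def resize_chw_batch(x: np.ndarray, size: int) -> np.ndarray:
    """Resize a uint8 (N, C, H, W) batch with PIL; return float32 scaled to [0, 1]."""
    out = np.zeros((x.shape[0], x.shape[1], size, size), dtype=np.float32)
    for i in range(x.shape[0]):
        pil_image = Image.fromarray(x[i].transpose(1, 2, 0))  # CHW -> HWC for PIL
        pil_image = pil_image.resize((size, size), Image.NEAREST)
        out[i] = np.array(pil_image, dtype=np.float32).transpose(2, 0, 1)
    return out / 255

# usage, mirroring the deleted loop:
# full_input_np = resize_chw_batch(x_train[:num_samples], 229)
```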
- - expansion: int = 4 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Module): - - def __init__( - self, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - num_classes: int = 1000, - zero_init_residual: bool = False, - groups: int = 1, - width_per_group: int = 64, - replace_stride_with_dilation: Optional[List[bool]] = None, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, - bias=False) - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, - dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, - dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2, - dilate=replace_stride_with_dilation[2]) - self.avgpool = nn.AvgPool2d(7, stride=1) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
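A quick shape check makes the `expansion` bookkeeping in `Bottleneck` concrete: the residual branch ends in `conv1x1(width, planes * expansion)`, so the identity path only matches when `inplanes == planes * expansion` (otherwise `_make_layer` must attach a downsample). A small self-contained check against the class defined above:

```python
import torch

# inplanes == planes * expansion (256 == 64 * 4), so no downsample is required
block = Bottleneck(inplanes=256, planes=64)
out = block(torch.randn(2, 256, 56, 56))
assert out.shape == (2, 256, 56, 56)  # channels and spatial size preserved
```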
- # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] - - def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, - stride: int = 1, dilate: bool = False) -> nn.Sequential: - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - def _forward_impl(self, x: Tensor) -> Tensor: - # See note [TorchScript super()] - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool(x) - x = torch.flatten(x, 1) - x = self.fc(x) - - return x - - def forward(self, x: Tensor) -> Tensor: - return self._forward_impl(x) - - -def _resnet( - arch: str, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - pretrained: bool, - progress: bool, - **kwargs: Any -) -> ResNet: - model = ResNet(block, layers, **kwargs) - return model - - -def resnet18(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, - **kwargs) - - -def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, - **kwargs) - - -def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, - **kwargs) - - -def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-101 model from - `"Deep Residual Learning for Image Recognition" `_. 
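`_make_layer` attaches a 1x1 `downsample` projection exactly when the first block of a stage changes stride or channel count; the remaining blocks of a stage never need one. This can be observed directly on the `resnet18()` factory defined above:

```python
import torch

model = resnet18()
print(model.layer1[0].downsample)  # None: stride 1 and 64 -> 64 channels
print(model.layer2[0].downsample)  # Sequential(conv1x1, BatchNorm2d): stride 2, 64 -> 128
```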
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, - **kwargs) - - -def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, - **kwargs) - - -def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNeXt-50 32x4d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['groups'] = 32 - kwargs['width_per_group'] = 4 - return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], - pretrained, progress, **kwargs) - - -def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNeXt-101 32x8d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['groups'] = 32 - kwargs['width_per_group'] = 8 - return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], - pretrained, progress, **kwargs) - - -def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""Wide ResNet-50-2 model from - `"Wide Residual Networks" `_. - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['width_per_group'] = 64 * 2 - return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], - pretrained, progress, **kwargs) - - -def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""Wide ResNet-101-2 model from - `"Wide Residual Networks" `_. - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
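All the variants above funnel through the single width formula in `Bottleneck.__init__`, `width = int(planes * (base_width / 64.)) * groups`; working the numbers shows why `wide_resnet50_2` and `resnext50_32x4d` both land at width 128 for the first stage:

```python
def bottleneck_width(planes: int, base_width: int = 64, groups: int = 1) -> int:
    # mirrors: width = int(planes * (base_width / 64.)) * groups
    return int(planes * (base_width / 64.0)) * groups

print(bottleneck_width(64))                           # 64  -> resnet50
print(bottleneck_width(64, base_width=128))           # 128 -> wide_resnet50_2 (width_per_group = 64 * 2)
print(bottleneck_width(64, base_width=4, groups=32))  # 128 -> resnext50_32x4d
```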
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['width_per_group'] = 64 * 2 - return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], - pretrained, progress, **kwargs) - - -input = torch.randn(64, 3, 224, 224) -model = resnet18() -torch.onnx.export(model, (input), "resnet18.onnx", export_params=False, training=TrainingMode.TRAINING) - -onnx_model = onnx.load("resnet18.onnx") - -for node in onnx_model.graph.node: - print(node) - -for input in onnx_model.graph.input: - print(input) diff --git a/examples/python/pytorch/cifar10_cnn.py b/examples/python/pytorch/cifar10_cnn.py deleted file mode 100644 index 44ee1a382c..0000000000 --- a/examples/python/pytorch/cifar10_cnn.py +++ /dev/null @@ -1,67 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.torch.model import file_to_ff - -#from accuracy import ModelAccuracy - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 32, 32] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - output_tensors = file_to_ff("cnn.ff", ffmodel, [input_tensor, input_tensor]) - - t = ffmodel.softmax(output_tensors[0]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - full_input_array = x_train - - y_train = y_train.astype('int32') - full_label_array = y_train - - dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_array) - dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_array) - - num_samples = dataloader_input.num_samples - - ffmodel.init_layers() - - layers = ffmodel.get_layers() - for layer in layers: - print(layers[layer].name) - - layer = ffmodel.get_layer_by_name("relu_1") - print(layer) - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.CIFAR10_CNN.value: - # assert 0, 'Check Accuracy' - - -if __name__ == "__main__": - print("cifar10 cnn") - top_level_task() - diff --git a/examples/python/pytorch/cifar10_cnn_torch.py b/examples/python/pytorch/cifar10_cnn_torch.py deleted file mode 100644 index d9b2a60b4d..0000000000 --- a/examples/python/pytorch/cifar10_cnn_torch.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch.nn as nn -import torch -from flexflow.torch.model import PyTorchModel - -class CNN(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(3, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 32, 3, 1) - self.pool1 = nn.MaxPool2d(2, 2) - self.conv3 = nn.Conv2d(32, 64, 3, 1) - self.conv4 = nn.Conv2d(64, 64, 
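The export at the bottom of the deleted `resnet_pt.py` passes `training=TrainingMode.TRAINING` so BatchNorm is exported in its training form (batch statistics), which is what a downstream training consumer needs. For contrast, a hedged sketch of the inference-style counterpart, assuming the same `model` and `input` as above; whether FlexFlow accepts such a graph is not established by this diff:

```python
import torch
from torch.onnx import TrainingMode

# inference-style export: batch-norm uses running statistics, weights are baked in
torch.onnx.export(model, (input,), "resnet18_eval.onnx",
                  export_params=True, training=TrainingMode.EVAL)
```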
3, 1) - self.pool2 = nn.MaxPool2d(2, 2) - self.flat1 = nn.Flatten() - self.linear1 = nn.Linear(512, 512) - self.linear2 = nn.Linear(512, 10) - self.relu = nn.ReLU() - - def forward(self, input1, input2): - y1 = self.conv1(input1) - y1 = self.relu(y1) - y2 = self.conv1(input2) - y2 = self.relu(y2) - y = torch.cat((y1, y2), 1) - (y1, y2) = torch.split(y, 1) - y = torch.cat((y1, y2), 1) - y = self.conv2(y) - y = self.relu(y) - y = self.pool1(y) - y = self.conv3(y) - y = self.relu(y) - y = self.conv4(y) - y = self.relu(y) - y = self.pool2(y) - y = self.flat1(y) - y = self.linear1(y) - y = self.relu(y) - yo = self.linear2(y) - return (yo, y) - -model = CNN() -ff_torch_model = PyTorchModel(model) -ff_torch_model.torch_to_file("cnn.ff") - diff --git a/examples/python/pytorch/export_regnet_fx.py b/examples/python/pytorch/export_regnet_fx.py deleted file mode 100644 index bafb20fc83..0000000000 --- a/examples/python/pytorch/export_regnet_fx.py +++ /dev/null @@ -1,8 +0,0 @@ -import classy_vision.models.regnet as rgn -from flexflow.torch.model import PyTorchModel -import torch.nn as nn - -model = rgn.RegNetX32gf() -model = nn.Sequential(model,nn.Flatten(),nn.Linear(2520*7*7,1000)) -ff_torch_model = PyTorchModel(model) -ff_torch_model.torch_to_file("regnetX32gf.ff") diff --git a/examples/python/pytorch/mnist_mlp.py b/examples/python/pytorch/mnist_mlp.py deleted file mode 100644 index 1384b2907b..0000000000 --- a/examples/python/pytorch/mnist_mlp.py +++ /dev/null @@ -1,56 +0,0 @@ -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist -from flexflow.torch.model import PyTorchModel - -#from accuracy import ModelAccuracy - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims = [ffconfig.batch_size, 784] - input_tensor = ffmodel.create_tensor(dims, DataType.DT_FLOAT); - - num_samples = 60000 - - output_tensors = PyTorchModel.file_to_ff("mlp.ff", ffmodel, [input_tensor]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - print(x_train.shape) - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - dataloader_input = ffmodel.create_data_loader(input_tensor, x_train) - dataloader_label = ffmodel.create_data_loader(label_tensor, y_train) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.MNIST_MLP.value: - # assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("mnist mlp") - top_level_task() diff --git a/examples/python/pytorch/mnist_mlp_torch.py b/examples/python/pytorch/mnist_mlp_torch.py deleted file mode 100644 index 
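The deleted `cifar10_cnn_torch.py` is the writer half of a file-based hand-off: it traces the model to `cnn.ff`, which `cifar10_cnn.py` earlier in this diff re-hydrates with `file_to_ff`. (Note in passing that `(y1, y2) = torch.split(y, 1)` in `forward` splits along dim 0 and would not unpack in eager PyTorch for batch sizes above 2; it only makes sense under the FlexFlow tracer.) The pairing, using only calls that appear in the deleted files:

```python
# writer side (plain PyTorch process) - cifar10_cnn_torch.py
model = CNN()
PyTorchModel(model).torch_to_file("cnn.ff")

# reader side (FlexFlow process) - cifar10_cnn.py; two inputs, matching CNN.forward
input_tensor = ffmodel.create_tensor([ffconfig.batch_size, 3, 32, 32], DataType.DT_FLOAT)
output_tensors = file_to_ff("cnn.ff", ffmodel, [input_tensor, input_tensor])
```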
4b31a9bc35..0000000000 --- a/examples/python/pytorch/mnist_mlp_torch.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch.nn as nn -from flexflow.torch.model import PyTorchModel - -class MLP(nn.Module): - def __init__(self): - super().__init__() - self.linear1 = nn.Linear(784, 512) - self.linear2 = nn.Linear(512, 512) - self.linear3 = nn.Linear(512, 10) - self.relu = nn.ReLU() - self.softmax = nn.Softmax() - - def forward(self, x): - y = self.linear1(x) - y = self.relu(y) - y = self.linear2(y) - y = self.relu(y) - y = self.linear3(y) - y = self.softmax(y) - return y - -model = MLP() -ff_torch_model = PyTorchModel(model) -ff_torch_model.torch_to_file("mlp.ff") - diff --git a/examples/python/pytorch/mnist_mlp_torch2.py b/examples/python/pytorch/mnist_mlp_torch2.py deleted file mode 100644 index 10e3be111c..0000000000 --- a/examples/python/pytorch/mnist_mlp_torch2.py +++ /dev/null @@ -1,76 +0,0 @@ -import torch.nn as nn -from flexflow.core import * -import numpy as np -from flexflow.keras.datasets import mnist -from flexflow.torch.model import PyTorchModel - -class MLP(nn.Module): - def __init__(self): - super().__init__() - self.linear1 = nn.Linear(784, 512) - self.linear2 = nn.Linear(512, 512) - self.linear3 = nn.Linear(512, 10) - self.relu = nn.ReLU() - self.softmax = nn.Softmax() - - def forward(self, x): - y = self.linear1(x) - y = self.relu(y) - y = self.linear2(y) - y = self.relu(y) - y = self.linear3(y) - y = self.softmax(y) - return y - -def top_level_task(): - ffconfig = FFConfig() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims = [ffconfig.batch_size, 784] - input_tensor = ffmodel.create_tensor(dims, DataType.DT_FLOAT); - - num_samples = 60000 - - model = MLP() - - ff_torch_model = PyTorchModel(model) - output_tensors = ff_torch_model.torch_to_ff(ffmodel, [input_tensor]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.label_tensor - - (x_train, y_train), (x_test, y_test) = mnist.load_data() - - print(x_train.shape) - x_train = x_train.reshape(60000, 784) - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - y_train = np.reshape(y_train, (len(y_train), 1)) - - dataloader_input = ffmodel.create_data_loader(input_tensor, x_train) - dataloader_label = ffmodel.create_data_loader(label_tensor, y_train) - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.MNIST_MLP.value: - # assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("mnist mlp") - top_level_task() diff --git a/examples/python/pytorch/mt5/.gitignore b/examples/python/pytorch/mt5/.gitignore deleted file mode 100644 index a3f13c5f6e..0000000000 --- a/examples/python/pytorch/mt5/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -data -*.tar diff --git a/examples/python/pytorch/mt5/README.md 
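`mnist_mlp_torch2.py` above shows the second conversion path: instead of serializing to `mlp.ff` and reading it back (as `mnist_mlp_torch.py` plus the PyTorch `mnist_mlp.py` do), it converts in-memory within a single process. Side by side, using the API exactly as the deleted files use it:

```python
model = MLP()

# path 1: file-based hand-off across two processes
PyTorchModel(model).torch_to_file("mlp.ff")                                   # exporter script
output_tensors = PyTorchModel.file_to_ff("mlp.ff", ffmodel, [input_tensor])   # FlexFlow script

# path 2: direct in-memory conversion in one FlexFlow process
output_tensors = PyTorchModel(model).torch_to_ff(ffmodel, [input_tensor])
```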
b/examples/python/pytorch/mt5/README.md deleted file mode 100644 index 2b5f1c580a..0000000000 --- a/examples/python/pytorch/mt5/README.md +++ /dev/null @@ -1,127 +0,0 @@ -# HuggingFace mT5 in FlexFlow -## Prerequisites and Setup - -We mention a few prerequisites and tips for setting up. -- We assume access to at least one GPU and an installation of Anaconda. -- We assume PyTorch version 1.9. -- Using PyTorch and FlexFlow concurrently requires a CPU version of PyTorch. - - To install the CPU version of `torch` (and `torchvision`), run: - ``` - conda install pytorch==1.9.0 torchvision==0.10.0 cpuonly -c pytorch - ``` - - To install the CPU version of `torch` from source, clone the [repository](https://github.com/pytorch/pytorch/tree/release/1.9), run `export USE_CUDA=0 USE_CUDNN=0 USE_MKLDNN=1`, run `git submodule sync; git submodule update --init --recursive`, and run `python setup.py develop` (or `python setup.py install`). -- We need an installation of the HuggingFace `transformers` repository. - - To install `transformers`, run: - ``` - conda install -c conda-forge transformers - ``` - - - To install `transformers` from source, clone the [repository](https://github.com/huggingface/transformers/tree/v4.10.2-release), and run `python setup.py develop` (or `python setup.py install`). -- To run PyTorch-FlexFlow examples, make sure to run `export FF_USE_CFFI=1` to use `cffi` instead of `pybind11`. -- Additional notes: - - You may need to update `huggingface_hub` with: - ``` - conda update huggingface_hub - ``` - - If you encounter `ImportError: Found an incompatible version of torch.`, try updating to a later version of `transformers`. - - - -## mT5 in PyTorch -We present an example of training mT5 for the Sinhalese-English translation -task from -[here](https://towardsdatascience.com/how-to-train-an-mt5-model-for-translation-with-simple-transformers-30ba5fa66c5f), -reusing some code from -[here](https://shivanandroy.com/fine-tune-t5-transformer-with-pytorch/). In -this section, we walk through the training script using PyTorch, and in the -next section, we walk through the training script using FlexFlow. The -corresponding code may be found in `mt5_torch.py` and `mt5_ff.py`, -respectively. - -To download and uncompress the dataset, run: -``` -cd examples/python/pytorch/mt5 -wget https://object.pouta.csc.fi/Tatoeba-Challenge/eng-sin.tar -tar -xvf eng-sin.tar -gzip -d data/eng-sin/*.gz -``` - -This will create a directory `data/` containing a single subdirectory -`data/eng-sin/` containing `test.id`, `test.src`, `test.trg`, `train.id`, -`train.src`, and `train.trg`. - -We extract, prepare, and save the data to `.tsv` by using -`DataPreparer.data_to_tsv()` -- this creates two new files, `data/train.tsv` and -`data/eval.tsv`, and only needs to be done once. Then, we can train using those -`.tsv` files. A base implementation for this may be found in `mt5_torch.py`, -which saves the `.tsv` files, trains for some number of epochs, and outputs a -`.csv` containing the predicted and actual text on the evaluation data. -``` -python examples/python/pytorch/mt5/mt5_torch.py -``` -_Note:_ Running `mt5_torch.py` requires a GPU-version of PyTorch. - - -## mT5 in FlexFlow - -Now, we examine how to write a similar training script using FlexFlow. To -begin, FlexFlow dataloaders expect the data to be passed in as `numpy` arrays -and to be already preprocessed so that batches may be directly given to the -model. 
In `mt5_ff.py`, `data_to_numpy()` converts the `.tsv` files to `.npy`, -and `preprocess_train()` performs the necessary preprocessing. - -_Note:_ `data_to_numpy()` takes a while to run. - -Next, following the conventional FlexFlow terminology, we define a _top-level -task_ to train the mT5 model. The key steps are as follows (including some -notable code snippets): -- Define `ffconfig = FFConfig()` and `ffmodel = FFModel(ffconfig)` -- `ffmodel` is the Python object for the FlexFlow model -- Define the PyTorch mT5 model: - ``` - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - ``` -- Load the preprocessed training data from the `.npy` files -- Use `ffmodel.create_tensor()` for the `input_ids`, `attention_mask`, and `decoder_input_ids` -- these are the input tensors to the model -- Construct a `PyTorchModel()` object wrapping the PyTorch model `model` to enable conversion to FlexFlow: - ``` - hf_model = PyTorchModel( - model, is_hf_model=True, batch_size=ffconfig.batch_size, - seq_length=seq_length, - ) - ``` - - We pass `is_hf_model=True` since HuggingFace models require a special `symbolic_trace()` distinct from the native PyTorch one. - - `seq_length` is a tuple `(encoder_seq_length, decoder_seq_length)`. -- Convert the model to FlexFlow: - ``` - output_tensors = hf_model.to_ff(ffmodel, input_tensors) - ``` -- Define the optimizer `ffoptimizer` -- Compile the model: - ``` - ffmodel.compile( - optimizer=ffoptimizer, - loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - metrics=[ - MetricsType.METRICS_ACCURACY, - MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - ], - ) - ``` -- Create the dataloaders for the `input_ids`, `attention_mask`, `decoder_input_ids`, and `labels` -- Initialize the model layers: - ``` - ffmodel.init_layers() - ``` -- Train the model, passing the appropriate dataloaders into `fit()`: - ``` - ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_ids_dl], - y=labels_dl, batch_size=batch_size, epochs=epochs, - ) - ``` - -A base implementation may be found in `mt5_ff.py`. -``` -./python/flexflow_python examples/python/pytorch/mt5/mt5_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 14000 -ll:zsize 4096 -``` -_Note:_ Running `mt5_ff.py` requires a CPU-version of PyTorch. diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py deleted file mode 100644 index 41b84a269e..0000000000 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ /dev/null @@ -1,168 +0,0 @@ -import itertools -import os -import sys - -import numpy as np -from flexflow.core import * -from flexflow.torch.model import PyTorchModel -from transformers import MT5ForConditionalGeneration, T5Tokenizer - -sys.path.append("./examples/python/pytorch/mt5") -from mt5_torch import DataPreparer, get_dataloaders, set_seed - -BASE_DIR = "examples/python/pytorch/mt5" -DATA_DIR = os.path.join(BASE_DIR, "data") -NUMPY_DIR = os.path.join(DATA_DIR, "numpy") - - -def data_to_numpy() -> None: - """ - Generates the files: - - `train_source_ids.npy` - - `train_source_mask.npy` - - `train_target_ids.npy` - - `eval_source_ids.npy` - - `eval_source_mask.npy` - - `eval_target_ids.npy` - This function should only need to be called once (to generate these files). 
- """ - model_params = { - "SEED": 42, - "MODEL": "google/mt5-small", - "TRAIN_BATCH_SIZE": None, # use the full dataset as one batch - "EVAL_BATCH_SIZE": None, # use the full dataset as one batch - "TRAIN_EPOCHS": 1, # unused - "MAX_SOURCE_TEXT_LENGTH": 48, - "MAX_TARGET_TEXT_LENGTH": 48, - } - set_seed(model_params) - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) - print("Getting dataloaders...") - train_loader, eval_loader = get_dataloaders(tokenizer, model_params) - assert len(train_loader) == 1 - assert len(eval_loader) == 1 - print("Saving to numpy...") - train_set_dict = next(iter(train_loader)) - eval_set_dict = next(iter(eval_loader)) - for k, v in train_set_dict.items(): - np.save(os.path.join(NUMPY_DIR, f"train_{k}.npy"), v.numpy()) - for k, v in eval_set_dict.items(): - np.save(os.path.join(NUMPY_DIR, f"eval_{k}.npy"), v.numpy()) - - -def preprocess_train() -> None: - """ - Generates the files: - - `train_y_ids.npy` - - `train_lm_labels.npy` - This function should only need to be called once (to generate these files). - """ - y = np.load(os.path.join(NUMPY_DIR, "train_target_ids.npy")) - y_shape = y.shape - assert len(y.shape) == 2, \ - "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - y_ids[:, :] = y[:, :-1] - lm_labels[:, :] = y[:, 1:] - - TOKENIZER_PAD_TOKEN_ID = 0 - NEW_PAD_TOKEN_ID = -100 - # Shift embedding values from {1, ..., n} to {0, ..., n-1} - y_ids[y[:, :-1] != TOKENIZER_PAD_TOKEN_ID] -= 1 - lm_labels[y[:, 1:] != TOKENIZER_PAD_TOKEN_ID] -= 1 - # Relabel the pad token ID (i.e. `tokenizer.pad_token_id`) from 0 to -100 - y_ids[y[:, :-1] == TOKENIZER_PAD_TOKEN_ID] = NEW_PAD_TOKEN_ID - lm_labels[y[:, 1:] == TOKENIZER_PAD_TOKEN_ID] = NEW_PAD_TOKEN_ID - np.save(os.path.join(NUMPY_DIR, "train_y_ids.npy"), y_ids) - np.save(os.path.join(NUMPY_DIR, "train_lm_labels.npy"), lm_labels) - - -def top_level_task(): - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - - # Load train data as numpy arrays - print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy")) - mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy")) - y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy")) - - batch_size = ffconfig.batch_size - input_ids_shape = (batch_size, ids.shape[1]) - attention_mask_shape = (batch_size, mask.shape[1]) - decoder_input_ids_shape = (batch_size, y_ids.shape[1]) - input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask - ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids - ] - encoder_seq_length = ids.shape[1] - decoder_seq_length = y_ids.shape[1] - seq_length = (encoder_seq_length, decoder_seq_length) - input_names = ["input_ids", "attention_mask", "decoder_input_ids"] - - print("Tracing the model...") - hf_model = PyTorchModel( - model, is_hf_model=True, input_names=input_names, - batch_size=batch_size, seq_length=seq_length, - ) - output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) - - print("Compiling the model...") - ffmodel.compile( - optimizer=ffoptimizer, - 
loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - metrics=[ - MetricsType.METRICS_ACCURACY, - MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - ], - ) - - print("Creating data loaders...") - input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids) - attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask) - decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) - # NOTE: We cast down the label tensor data to 32-bit to accommodate the - # label tensor's required dtype - labels_dl = ffmodel.create_data_loader( - ffmodel.label_tensor, lm_labels.astype("int32") - ) - - print("Initializing model layers...") - ffmodel.init_layers() - - print("Training...") - epochs = ffconfig.epochs - ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - y=labels_dl, batch_size=batch_size, epochs=epochs, - ) - - -if __name__ == "__main__": - # Generate the .tsv files if needed - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - # Convert the .tsv files to .npy if needed - if not os.path.exists(NUMPY_DIR): - os.mkdir(NUMPY_DIR) - prefixes = ["train_", "eval_"] - suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] - npy_filenames = [ - pre + suf for pre, suf in itertools.product(prefixes, suffixes) - ] - if any( - not os.path.exists(os.path.join(NUMPY_DIR, filename)) - for filename in npy_filenames - ): - data_to_numpy() - # Preprocess the training data if needed - if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ - not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): - preprocess_train() - top_level_task() diff --git a/examples/python/pytorch/mt5/mt5_torch.py b/examples/python/pytorch/mt5/mt5_torch.py deleted file mode 100644 index 78886eed6c..0000000000 --- a/examples/python/pytorch/mt5/mt5_torch.py +++ /dev/null @@ -1,315 +0,0 @@ -""" -Based on: -https://towardsdatascience.com/how-to-train-an-mt5-model-for-translation-with-simple-transformers-30ba5fa66c5f -https://shivanandroy.com/fine-tune-t5-transformer-with-pytorch/ -""" - -import os - -import numpy as np -import pandas as pd -import torch -from torch.utils.data import DataLoader, Dataset -from transformers import MT5ForConditionalGeneration, T5Tokenizer - -BASE_DIR = "examples/python/pytorch/mt5" -DATA_DIR = os.path.join(BASE_DIR, "data") -OUTPUT_DIR = os.path.join(BASE_DIR, "output") - - -class DataPreparer(): - """ - This class prepares the data -- :meth:`data_to_tsv` should only be called - once, and the data can be directly loaded from the .tsv files thereafter. - """ - @staticmethod - def prepare_data(data_path): - """ - Returns: train_df, eval_df - train_df (pd.DataFrame): Training dataframe. - eval_df (pd.DataFrame): Evaluation dataframe. 
- """ - sinhala_train_filename = os.path.join(data_path, "train.trg") - with open(sinhala_train_filename, "r", encoding="utf-8")as f: - sinhala_text = f.readlines() - sinhala_text = [text.strip("\n") for text in sinhala_text] - english_train_filename = os.path.join(data_path, "train.src") - with open(english_train_filename, "r") as f: - english_text = f.readlines() - english_text = [text.strip("\n") for text in english_text] - - data = [] - for sinhala, english in zip(sinhala_text, english_text): - data.append(["translate sinhala to english", sinhala, english]) - data.append(["translate english to sinhala", english, sinhala]) - train_df = pd.DataFrame( - data, columns=["prefix", "input_text", "target_text"] - ) - - sinhala_test_filename = os.path.join(data_path, "test.trg") - with open(sinhala_test_filename, "r", encoding="utf-8") as f: - sinhala_text = f.readlines() - sinhala_text = [text.strip("\n") for text in sinhala_text] - english_test_filename = os.path.join(data_path, "test.src") - with open(english_test_filename, "r") as f: - english_text = f.readlines() - english_text = [text.strip("\n") for text in english_text] - - data = [] - for sinhala, english in zip(sinhala_text, english_text): - data.append(["translate sinhala to english", sinhala, english]) - data.append(["translate english to sinhala", english, sinhala]) - eval_df = pd.DataFrame( - data, columns=["prefix", "input_text", "target_text"] - ) - - return train_df, eval_df - - @staticmethod - def data_to_tsv(): - """Saves the training data and evaluation data to .tsv files.""" - train_df, eval_df = DataPreparer.prepare_data( - os.path.join(DATA_DIR, "eng-sin") - ) - train_df.to_csv(os.path.join(DATA_DIR, "train.tsv"), sep="\t") - eval_df.to_csv(os.path.join(DATA_DIR, "eval.tsv"), sep="\t") - - -class SinhaleseDataset(Dataset): - def __init__( - self, dataframe, tokenizer, source_len, target_len, source_text, - target_text, - ): - """ - Args: - dataframe (pd.DataFrame): Input dataframe. - tokenizer (transformers.tokenizer): Transformers tokenizer. - source_len (int): Max length of source text. - target_len (int): Max length of target text. - source_text (str): Column name of source text. - target_text (str): Column name of target text. 
- """ - self.df = dataframe - self.tokenizer = tokenizer - self.source_len = source_len - self.target_len = target_len - self.source_text = self.df[source_text] - self.target_text = self.df[target_text] - - def __len__(self): - """Returns the length of the dataframe.""" - return len(self.target_text) - - def __getitem__(self, index): - """Returns the input IDs, target IDs, and attention masks for the - given index.""" - src_text = str(self.source_text[index]) - tar_text = str(self.target_text[index]) - src_text = " ".join(src_text.split()) - tar_text = " ".join(tar_text.split()) - - src = self.tokenizer.batch_encode_plus( - [src_text], - max_length=self.source_len, - pad_to_max_length=True, - truncation=True, - padding="max_length", - return_tensors="pt", - ) - tar = self.tokenizer.batch_encode_plus( - [tar_text], - max_length=self.target_len, - pad_to_max_length=True, - truncation=True, - padding="max_length", - return_tensors="pt", - ) - - source_ids = src["input_ids"].squeeze() - source_mask = src["attention_mask"].squeeze() - target_ids = tar["input_ids"].squeeze() - target_mask = tar["attention_mask"].squeeze() - - return { - "source_ids": source_ids.to(dtype=torch.long), - "source_mask": source_mask.to(dtype=torch.long), - "target_ids": target_ids.to(dtype=torch.long), - "target_ids_y": target_ids.to(dtype=torch.long), - } - - -def train(epoch, tokenizer, model, device, loader, optimizer): - model.train() - for i, data in enumerate(loader, 0): - y = data["target_ids"].to(device, dtype=torch.long) - y_ids = y[:, :-1].contiguous() - lm_labels = y[:, 1:].clone().detach() - lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 - ids = data["source_ids"].to(device, dtype=torch.long) - mask = data["source_mask"].to(device, dtype=torch.long) - - outputs = model( - input_ids=ids, - attention_mask=mask, - decoder_input_ids=y_ids, - labels=lm_labels, - ) - loss = outputs[0] - if i % 10 == 0: - print(f"Epoch={epoch}\tbatch={i} \tloss={loss:.3f}") - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - -def eval(epoch, tokenizer, model, device, loader): - model.eval() - predictions = [] - actuals = [] - with torch.no_grad(): - for i, data in enumerate(loader, 0): - y = data["target_ids"].to(device, dtype=torch.long) - ids = data["source_ids"].to(device, dtype=torch.long) - mask = data["source_mask"].to(device, dtype=torch.long) - - generated_ids = model.generate( - input_ids=ids, - attention_mask=mask, - max_length=150, - num_beams=2, - repetition_penalty=2.5, - length_penalty=1.0, - early_stopping=True - ) - preds = [ - tokenizer.decode( - g, skip_special_tokens=True, - clean_up_tokenization_spaces=True, - ) for g in generated_ids - ] - target = [ - tokenizer.decode( - t, skip_special_tokens=True, - clean_up_tokenization_spaces=True, - ) - for t in y - ] - if i % 10 == 0: - print(f"Epoch={epoch}\tbatch={i}") - predictions.extend(preds) - actuals.extend(target) - return predictions, actuals - - -def get_dataframes(): - train_df = pd.read_csv( - os.path.join(DATA_DIR, "train.tsv"), sep="\t", - ).astype(str) - eval_df = pd.read_csv( - os.path.join(DATA_DIR, "eval.tsv"), sep="\t", - ).astype(str) - train_df["prefix"] = "" - eval_df["prefix"] = "" - return train_df, eval_df - - -def set_seed(model_params): - torch.manual_seed(model_params["SEED"]) - np.random.seed(model_params["SEED"]) - torch.backends.cudnn.deterministic = True - - -def get_datasets(tokenizer, model_params): - train_df, eval_df = get_dataframes() - source_text = "input_text" - target_text = "target_text" - train_set = 
SinhaleseDataset( - train_df, - tokenizer, - model_params["MAX_SOURCE_TEXT_LENGTH"], - model_params["MAX_TARGET_TEXT_LENGTH"], - source_text, - target_text, - ) - eval_set = SinhaleseDataset( - eval_df, - tokenizer, - model_params["MAX_SOURCE_TEXT_LENGTH"], - model_params["MAX_TARGET_TEXT_LENGTH"], - source_text, - target_text, - ) - return train_set, eval_set - - -def get_dataloaders(tokenizer, model_params): - train_set, eval_set = get_datasets(tokenizer, model_params) - # Use the full dataset as one batch if the given batch size is `None` - train_batch_size = model_params["TRAIN_BATCH_SIZE"] - if train_batch_size is None: - train_batch_size = len(train_set) - eval_batch_size = model_params["EVAL_BATCH_SIZE"] - if eval_batch_size is None: - eval_batch_size = len(eval_set) - train_params = { - "batch_size": train_batch_size, - "shuffle": True, - "num_workers": 0, - } - eval_params = { - "batch_size": eval_batch_size, - "shuffle": False, - "num_workers": 0, - } - train_loader = DataLoader(train_set, **train_params) - eval_loader = DataLoader(eval_set, **eval_params) - return train_loader, eval_loader - - -def TorchMT5Trainer( - model_params, - device, - output_dir=OUTPUT_DIR, -): - set_seed(model_params) - - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) - model = MT5ForConditionalGeneration.from_pretrained(model_params["MODEL"]) - model = model.to(device) - - print("Reading data...") - train_loader, eval_loader = get_dataloaders(tokenizer, model_params) - optimizer = torch.optim.SGD( - params=model.parameters(), lr=model_params["LEARNING_RATE"], - ) - - print("Training...") - for epoch in range(1, model_params["TRAIN_EPOCHS"] + 1): - train(epoch, tokenizer, model, device, train_loader, optimizer) - - print("Evaluating...") - predictions, actuals = eval(0, tokenizer, model, device, eval_loader) - output_df = pd.DataFrame({"Predictions": predictions, "Actuals": actuals}) - if not os.path.exists(output_dir): - os.mkdir(output_dir) - output_df.to_csv(output_dir) - - -if __name__ == "__main__": - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - - model_params = { - "SEED": 42, - "MODEL": "google/mt5-small", - "TRAIN_BATCH_SIZE": 32, - "EVAL_BATCH_SIZE": 32, - "TRAIN_EPOCHS": 2, - "MAX_SOURCE_TEXT_LENGTH": 48, - "MAX_TARGET_TEXT_LENGTH": 48, - "LEARNING_RATE": 1e-4, - } - device = torch.device(0) - TorchMT5Trainer(model_params, device) diff --git a/examples/python/pytorch/regnet.py b/examples/python/pytorch/regnet.py deleted file mode 100644 index 07cc1ad0c8..0000000000 --- a/examples/python/pytorch/regnet.py +++ /dev/null @@ -1,72 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.torch.model import PyTorchModel -import os -import numpy as np - -#from accuracy import ModelAccuracy -from PIL import Image - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 229, 229] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - output_tensors = PyTorchModel.file_to_ff("regnetX32gf.ff", ffmodel, [input]) - t = ffmodel.softmax(output_tensors[0]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - 
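`get_dataloaders` above encodes a small convention: a `None` batch size means "the whole dataset as one batch", which is what lets `data_to_numpy` in `mt5_ff.py` assert `len(train_loader) == 1` and grab everything with a single `next(iter(...))`. The mechanism is plain `torch.utils.data.DataLoader` behavior:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.arange(10))
loader = DataLoader(ds, batch_size=len(ds))  # batch size == dataset size
assert len(loader) == 1                      # exactly one batch, containing everything
(full_batch,) = next(iter(loader))
```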
ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_np) - dataloader_label = ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.CIFAR10_ALEXNET.value: - # assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("regnetX32gf torch") - top_level_task() diff --git a/examples/python/pytorch/resnet.py b/examples/python/pytorch/resnet.py deleted file mode 100644 index 383a21a532..0000000000 --- a/examples/python/pytorch/resnet.py +++ /dev/null @@ -1,72 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.torch.model import PyTorchModel - -#from accuracy import ModelAccuracy -from PIL import Image -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 224, 224] - input = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - output_tensors = PyTorchModel.file_to_ff("resnet18.ff", ffmodel, [input]) - t = ffmodel.softmax(output_tensors[0]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 224, 224), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((224,224), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input, full_input_np) - dataloader_label = 
ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - #ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - ffmodel.eval(x=dataloader_input, y=dataloader_label) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.CIFAR10_ALEXNET.value: - # assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("resnet torch") - top_level_task() diff --git a/examples/python/pytorch/resnet152_DDP_training.py b/examples/python/pytorch/resnet152_DDP_training.py deleted file mode 100644 index ad1e7f4398..0000000000 --- a/examples/python/pytorch/resnet152_DDP_training.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import tempfile -import torch,torchvision -import torch.distributed as dist -import torch.nn as nn -import torch.optim as optim -import argparse -import torch.multiprocessing as mp -import torchvision.transforms as transforms -import torchvision.models as models - -import time -import pandas as pd - - -def setup(rank, world_size): - print("Running init_process_group...") - dist.init_process_group("nccl", rank=rank, world_size=world_size) - print("Finished init_process_group...") - -def cleanup(): - dist.destroy_process_group() - -def train(gpu, args): - rank = args.nr * args.gpus + gpu - setup(rank, args.world_size) - transform = transforms.Compose([ - torchvision.transforms.Resize(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]) - ]) - batch_size = args.batchsize - train_dataset = torchvision.datasets.CIFAR10('./datasets/',transform=transform,download=True) - sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,num_replicas=args.world_size,rank=rank) - trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=2,sampler=sampler) - - model = models.resnet152() - torch.cuda.set_device(gpu) - model.cuda() - print("GPU initialization") - dummy_input = torch.randn(1, 3,224,224, dtype=torch.float).to(gpu) - for _ in range(10): - _ = model(dummy_input) - model = nn.parallel.DistributedDataParallel(model,device_ids=[gpu]) - - criterion = nn.CrossEntropyLoss().cuda() - optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - training_run_data = pd.DataFrame(columns=['epoch','batch','batch_size','gpu_number','time']) - starter, ender = torch.cuda.Event(enable_timing=True),torch.cuda.Event(enable_timing=True) - for epoch in range(args.epochs): # loop over the dataset multiple times - running_loss = 0.0 - print("Epoch %d"%epoch) - sampler.set_epoch(epoch) - for i, data in enumerate(trainloader, 0): - starter.record() - inputs, labels = data - inputs = inputs.cuda() - labels = labels.cuda() - - optimizer.zero_grad() - outputs = model(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - ender.record() - # print statistics - if rank==0: - torch.cuda.synchronize() - timer = starter.elapsed_time(ender) - training_run_data=training_run_data.append( - {'epoch':epoch, 
'batch':i,'loss':loss.item(),'batch_size':batch_size,'gpu_number':args.gpus*args.nodes,'time (ms)':timer/(batch_size*args.gpus),'throughput':1000*(batch_size*args.gpus)/timer}, - ignore_index=True) - training_run_data.to_csv("training_stats_GPU_%.0f_batchsize_%.0f.csv"%(args.gpus*args.nodes,batch_size),index=False) - print("[Epoch %d] Batch: %d Loss: %.3f Time per Image: %.2f msi Throughput:%.2f"% - (epoch,i,loss.item(),timer/(batch_size*args.gpus),1000*(batch_size*args.gpus)/timer)) - - running_loss += loss.item() - if i % 2000 == 1999: # print every 2000 mini-batches - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 2000)) - running_loss = 0.0 - cleanup() - -def main(): - print("Parsing arguments...") - parser = argparse.ArgumentParser() - parser.add_argument('-n', '--nodes', default=1, - type=int, metavar='N') - parser.add_argument('-g', '--gpus', default=1, type=int, - help='number of gpus per node') - parser.add_argument('--epochs', default=2, type=int, - metavar='N', - help='number of epochs') - parser.add_argument('-b','--batchsize', default=12, type=int) - args = parser.parse_args() - if 'SLURMD_NODENAME' in os.environ: - if os.environ['SLURMD_NODENAME']==os.environ['MASTER_ADDR']: - args.nr=0 - else: - args.nr=1 - else: - args.nr=0 - args.world_size = args.gpus * args.nodes - print("Spawning processes...") - mp.spawn(train, nprocs=args.gpus, args=(args,)) - -if __name__=='__main__': - main() - diff --git a/examples/python/pytorch/resnet152_training.py b/examples/python/pytorch/resnet152_training.py deleted file mode 100644 index a76a8815ed..0000000000 --- a/examples/python/pytorch/resnet152_training.py +++ /dev/null @@ -1,58 +0,0 @@ -from resnet_torch import resnet152 -import torch,torchvision -import torchvision.transforms as transforms -import torchvision.models as models -import torch.optim as optim -import torch.nn as nn -import time - -device = "cuda:0" -batch_size = 4 -normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) -transform = transforms.Compose([ - torchvision.transforms.Resize(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ]) - -train_dataset = torchvision.datasets.CIFAR10('./datasets/',transform=transform,download=True) -trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,shuffle=True, num_workers=2) - - -model = models.resnet152() -torch.cuda.set_device(device) -model.cuda() -print("Setting optimizer") - -criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - -print("Starting training") -for epoch in range(2): # loop over the dataset multiple times - running_loss = 0.0 - print("Epoch %d"%epoch) - for i, data in enumerate(trainloader, 0): - start = time.time() - inputs, labels = data - inputs = inputs.cuda() - labels = labels.cuda() - - optimizer.zero_grad() - outputs = model(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - end = time.time() - - # print statistics - print("[Epoch %d] Batch: %d Loss: %.3f Time per Image: %.5f"%(epoch,i,loss.item(),(end - start)/batch_size)) - - running_loss += loss.item() - if i % 2000 == 1999: # print every 2000 mini-batches - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 2000)) - running_loss = 0.0 - -print('Finished Training') \ No newline at end of file diff --git a/examples/python/pytorch/resnet_torch.py b/examples/python/pytorch/resnet_torch.py deleted file mode 100644 index 67d6533b1b..0000000000 --- 
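The two deleted training scripts time batches differently: `resnet152_DDP_training.py` uses paired `torch.cuda.Event` objects, while `resnet152_training.py` uses `time.time()`. Because CUDA launches are asynchronous, the event-based version is the one that measures GPU time correctly; the host-clock version needs an explicit synchronize to mean anything. The event pattern in isolation:

```python
import torch

starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)

starter.record()
# ... GPU work: forward, backward, optimizer step ...
ender.record()
torch.cuda.synchronize()                  # wait until both events have completed
elapsed_ms = starter.elapsed_time(ender)  # milliseconds between the two events
```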
a/examples/python/pytorch/resnet_torch.py +++ /dev/null @@ -1,372 +0,0 @@ -import torch -from torch import Tensor -import torch.nn as nn -from typing import Type, Any, Callable, Union, List, Optional -from flexflow.torch.model import PyTorchModel - -def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: - """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=False, dilation=dilation) - - -def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -class BasicBlock(nn.Module): - expansion: int = 1 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError('BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. - # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
- - expansion: int = 4 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Module): - - def __init__( - self, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - num_classes: int = 1000, - zero_init_residual: bool = False, - groups: int = 1, - width_per_group: int = 64, - replace_stride_with_dilation: Optional[List[bool]] = None, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, - bias=False) - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, - dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, - dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2, - dilate=replace_stride_with_dilation[2]) - self.avgpool = nn.AvgPool2d(7, stride=1) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
- # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] - - def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, - stride: int = 1, dilate: bool = False) -> nn.Sequential: - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - def _forward_impl(self, x: Tensor) -> Tensor: - # See note [TorchScript super()] - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool(x) - x = torch.flatten(x, 1) - x = self.fc(x) - - return x - - def forward(self, x: Tensor) -> Tensor: - return self._forward_impl(x) - - -def _resnet( - arch: str, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - pretrained: bool, - progress: bool, - **kwargs: Any -) -> ResNet: - model = ResNet(block, layers, **kwargs) - return model - - -def resnet18(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, - **kwargs) - - -def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, - **kwargs) - - -def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, - **kwargs) - - -def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-101 model from - `"Deep Residual Learning for Image Recognition" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, - **kwargs) - - -def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, - **kwargs) - - -def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNeXt-50 32x4d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['groups'] = 32 - kwargs['width_per_group'] = 4 - return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], - pretrained, progress, **kwargs) - - -def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNeXt-101 32x8d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['groups'] = 32 - kwargs['width_per_group'] = 8 - return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], - pretrained, progress, **kwargs) - - -def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""Wide ResNet-50-2 model from - `"Wide Residual Networks" `_. - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['width_per_group'] = 64 * 2 - return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], - pretrained, progress, **kwargs) - - -def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""Wide ResNet-101-2 model from - `"Wide Residual Networks" `_. - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['width_per_group'] = 64 * 2 - return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], - pretrained, progress, **kwargs) - - -input = torch.randn(64, 3, 224, 224) -model = resnet18() -ff_torch_model = PyTorchModel(model) -ff_torch_model.torch_to_file("resnet.ff") \ No newline at end of file diff --git a/examples/python/pytorch/torch_vision.py b/examples/python/pytorch/torch_vision.py deleted file mode 100644 index ea4e12b751..0000000000 --- a/examples/python/pytorch/torch_vision.py +++ /dev/null @@ -1,69 +0,0 @@ -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.torch.model import PyTorchModel - -from PIL import Image -import numpy as np - -def top_level_task(): - ffconfig = FFConfig() - alexnetconfig = NetConfig() - print(alexnetconfig.dataset_path) - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.batch_size, ffconfig.workers_per_node, ffconfig.num_nodes)) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.batch_size, 3, 229, 229] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - output_tensors = PyTorchModel.file_to_ff("squeezenet.ff", ffmodel, [input_tensor]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.optimizer = ffoptimizer - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label = ffmodel.label_tensor - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_np) - dataloader_label = ffmodel.create_data_loader(label, full_label_np) - - num_samples = dataloader_input.num_samples - assert dataloader_input.num_samples == dataloader_label.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.epochs - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.CIFAR10_ALEXNET.value: - # assert 0, 'Check Accuracy' - -if __name__ == "__main__": - print("alexnet onnx") - top_level_task() diff --git a/examples/python/pytorch/torch_vision_torch.py b/examples/python/pytorch/torch_vision_torch.py deleted file mode 100644 index e7e617de70..0000000000 --- a/examples/python/pytorch/torch_vision_torch.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch.nn as nn -import torchvision.models as models -from flexflow.torch.model import PyTorchModel - -# model = models.alexnet() - -# model = models.vgg16() - -# model = models.squeezenet1_0() - -# 
model = models.densenet161() - -# model = models.inception_v3() - -model = models.googlenet() - -# model = models.shufflenet_v2_x1_0() - -# model = models.mobilenet_v2() -ff_torch_model = PyTorchModel(model) -ff_torch_model.torch_to_file("googlenet.ff") \ No newline at end of file diff --git a/flake.lock b/flake.lock index 87fae7f446..8482ad1ba4 100644 --- a/flake.lock +++ b/flake.lock @@ -18,6 +18,29 @@ "type": "github" } }, + "nixGL": { + "inputs": { + "flake-utils": [ + "flake-utils" + ], + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1713543440, + "narHash": "sha256-lnzZQYG0+EXl/6NkGpyIz+FEOc/DSEG57AP1VsdeNrM=", + "owner": "nix-community", + "repo": "nixGL", + "rev": "310f8e49a149e4c9ea52f1adf70cdc768ec53f8a", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nixGL", + "type": "github" + } + }, "nixpkgs": { "locked": { "lastModified": 1710162809, @@ -43,11 +66,11 @@ ] }, "locked": { - "lastModified": 1728341842, - "narHash": "sha256-XMS52KBSS6z3k2VaiVcHyZQD6b2QUm1wIvTClel4xwg=", + "lastModified": 1737340566, + "narHash": "sha256-fIRXCPLjOZ7QweKH6GanI4BjMotMdp4M9r6/V3l/eXo=", "owner": "lockshaw", "repo": "proj", - "rev": "830fb5b1a0c7087752693990e90bbbf021168dfe", + "rev": "2394e855baeea5757c75eadde748902988db5509", "type": "github" }, "original": { @@ -59,6 +82,7 @@ "root": { "inputs": { "flake-utils": "flake-utils", + "nixGL": "nixGL", "nixpkgs": "nixpkgs", "proj-repo": "proj-repo" } diff --git a/flake.nix b/flake.nix index afbc2c1e37..91651bd0c1 100644 --- a/flake.nix +++ b/flake.nix @@ -22,9 +22,15 @@ inputs.nixpkgs.follows = "nixpkgs"; inputs.flake-utils.follows = "flake-utils"; }; + + nixGL = { + url = "github:nix-community/nixGL"; + inputs.nixpkgs.follows = "nixpkgs"; + inputs.flake-utils.follows = "flake-utils"; + }; }; - outputs = { self, nixpkgs, flake-utils, proj-repo, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: + outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... 
}: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: let pkgs = import nixpkgs { inherit system; @@ -35,10 +41,13 @@ mkShell = pkgs.mkShell.override { stdenv = pkgs.cudaPackages.backendStdenv; }; + + proj = proj-repo.packages.${system}.proj; in { packages = { legion = pkgs.callPackage ./.flake/pkgs/legion.nix { }; + ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; @@ -62,24 +71,20 @@ ci = mkShell { shellHook = '' export PATH="$HOME/ff/.scripts/:$PATH" + export RC_PARAMS="max_discard_ratio=100" + export CMAKE_FLAGS="-DFF_USE_EXTERNAL_LEGION=ON \ + -DFF_USE_EXTERNAL_NCCL=ON \ + -DFF_USE_EXTERNAL_JSON=ON \ + -DFF_USE_EXTERNAL_FMT=ON \ + -DFF_USE_EXTERNAL_SPDLOG=ON \ + -DFF_USE_EXTERNAL_DOCTEST=ON \ + -DFF_USE_EXTERNAL_RAPIDCHECK=ON \ + -DFF_USE_EXTERNAL_EXPECTED=ON \ + -DFF_USE_EXTERNAL_RANGEV3=ON \ + -DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON \ + -DFF_USE_EXTERNAL_TYPE_INDEX=ON" ''; - CMAKE_FLAGS = lib.strings.concatStringsSep " " [ - "-DFF_USE_EXTERNAL_LEGION=ON" - "-DFF_USE_EXTERNAL_NCCL=ON" - "-DFF_USE_EXTERNAL_JSON=ON" - "-DFF_USE_EXTERNAL_FMT=ON" - "-DFF_USE_EXTERNAL_SPDLOG=ON" - "-DFF_USE_EXTERNAL_DOCTEST=ON" - "-DFF_USE_EXTERNAL_RAPIDCHECK=ON" - "-DFF_USE_EXTERNAL_EXPECTED=ON" - "-DFF_USE_EXTERNAL_RANGEV3=ON" - "-DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON" - "-DFF_USE_EXTERNAL_TYPE_INDEX=ON" - ]; - - RC_PARAMS = "max_discard_ratio=100"; - buildInputs = builtins.concatLists [ (with pkgs; [ zlib @@ -101,22 +106,30 @@ tl-expected doxygen lcov # for code coverage + compdb ]) (with proj-repo.packages.${system}; [ proj ]) (with self.packages.${system}; [ legion - hpp2plantuml rapidcheckFull doctest ]) ]; }; + gpu-ci = mkShell { + inputsFrom = [ ci ]; + buildInputs = builtins.concatLists [ + (with nixGL.packages.${system}; [ + nixGLDefault + ]) + ]; + }; + default = mkShell { inputsFrom = [ ci ]; - inherit (ci) CMAKE_FLAGS RC_PARAMS; VIMPLUGINS = lib.strings.concatStringsSep "," [ "${proj-repo.packages.${system}.proj-nvim}" @@ -128,12 +141,9 @@ gh-markdown-preview shellcheck plantuml - gdb ruff - compdb jq gh - lcov # for code coverage ]) (with pkgs.python3Packages; [ gitpython @@ -148,8 +158,16 @@ black toml ]) + (with self.packages.${system}; [ + ffdb + hpp2plantuml + ]) ]; }; + + gpu = mkShell { + inputsFrom = [ gpu-ci default ]; + }; }; } ); diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index 65bae0c76a..ecaffa337b 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/cost_estimator/tensor_set_movement.dtg.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "op-attrs/pcg_operator_attrs.dtg.h" @@ -11,7 +12,7 @@ namespace FlexFlow { struct ICostEstimator { - virtual float estimate_cost(OpCostEstimateKey const &) const = 0; + virtual OpCostMetrics estimate_cost(OpCostEstimateKey const &) const = 0; virtual float estimate_cost(TensorSetMovement const &) const = 0; ICostEstimator() = default; @@ -23,7 +24,7 @@ struct ICostEstimator { CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); struct CostEstimator { - float 
estimate_cost(OpCostEstimateKey const &k) const; + OpCostMetrics estimate_cost(OpCostEstimateKey const &) const; float estimate_cost(TensorSetMovement const &m) const; template <typename T, typename... Args> diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml new file mode 100644 index 0000000000..d2ff3f42e7 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "OpCostMetrics" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "runtime" +type = "float" + +[[fields]] +name = "memory" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h new file mode 100644 index 0000000000..d176d298db --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h @@ -0,0 +1,48 @@ +#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H +#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H + +#include "compiler/machine_mapping/machine_mapping_cache.dtg.h" +#include "compiler/machine_mapping/machine_mapping_constraints.dtg.h" +#include "compiler/machine_mapping/machine_mapping_context.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h" +#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include "pcg/machine_specification.dtg.h" + +namespace FlexFlow { + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + std::optional<ParallelSplitTransformation> const + &parallel_split_transformation); + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeParallelSplit const &parallel_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml new file mode 100644 index 0000000000..b61dd134c0 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml @@ -0,0 +1,20 @@ +namespace = "FlexFlow" +name = "MachineMappingForSingleLayer" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", + "compiler/cost_estimator/op_cost_metrics.dtg.h", +] + +[[fields]] +name = "cost" +type = "::FlexFlow::OpCostMetrics" + +[[fields]] +name = "machine_mapping" +type = "::FlexFlow::ParallelLayerGuidObliviousMachineMapping" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h new file mode 100644 index 0000000000..b749235c89 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H + +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h" + +namespace FlexFlow { + +MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache(); +std::optional<MachineMappingWithMemoryResult> + machine_mapping_with_memory_cache_load( + MachineMappingWithMemoryCache const &, MachineMappingState const &); +void machine_mapping_with_memory_cache_save( + MachineMappingWithMemoryCache &, + MachineMappingState const &, + MachineMappingWithMemoryResult const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml new file mode 100644 index 0000000000..c2fe393e99 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml @@ -0,0 +1,22 @@ +namespace = "FlexFlow" +name = "MachineMappingWithMemoryCache" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "<unordered_map>", + "compiler/machine_mapping/machine_mapping_state.dtg.h", + "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h", +] + +src_includes = [ + "utils/fmt/unordered_map.h", + "utils/hash/unordered_map.h", +] + +[[fields]] +name = "raw_map" +type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingWithMemoryResult>" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h new file mode 100644 index 0000000000..0383376116 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H +#define 
_FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H + +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h" +#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include <optional> + +namespace FlexFlow { + +[[nodiscard]] MachineMappingWithMemoryResult + empty_machine_mapping_with_memory_result(); +[[nodiscard]] bool is_empty(MachineMappingWithMemoryResult const &); + +[[nodiscard]] MachineMappingWithMemoryResult get_mapping_with_minimal_runtime( + std::unordered_set<MachineMappingWithMemoryResult> const &); + +[[nodiscard]] MachineMappingWithMemoryResult + remove_non_pareto_optimal_machine_mapping_result( + MachineMappingWithMemoryResult const &); + +[[nodiscard]] MachineMappingWithMemoryResult + series_combine(float comm_cost, + MachineMappingWithMemoryResult const &pre_result, + MachineMappingWithMemoryResult const &post_result, + std::optional<ParallelSplitTransformation> const + &parallel_split_transformation); +[[nodiscard]] MachineMappingWithMemoryResult + parallel_combine(MachineMappingWithMemoryResult const &lhs_result, + MachineMappingWithMemoryResult const &rhs_result); + +[[nodiscard]] MachineMappingWithMemoryResult + minimize_runtime(MachineMappingWithMemoryResult const &m1, + MachineMappingWithMemoryResult const &m2); + +[[nodiscard]] MachineMappingWithMemoryResult + make_singleton_machine_mapping_with_memory_result( + OpCostMetrics cost, MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml new file mode 100644 index 0000000000..c1e1ee1cac --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml @@ -0,0 +1,20 @@ +namespace = "FlexFlow" +name = "MachineMappingWithMemoryResult" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_set.h", + "utils/fmt/unordered_set.h", +] + +[[fields]] +name = "machine_mappings" +type = "std::unordered_set<::FlexFlow::MachineMappingForSingleLayer>" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml new file mode 100644 index 0000000000..0d2572c783 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "MachineMemoryConstraints" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [] + +[[fields]] +name = "memory_limit" +type = "size_t" diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc index 051ffcd190..6ac6e3a8d6 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc @@ -5,7 +5,7 @@ namespace FlexFlow { CostEstimator::CostEstimator(std::shared_ptr<ICostEstimator> implementation_ptr) : implementation_ptr(implementation_ptr) {} -float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +OpCostMetrics CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return 
this->implementation_ptr->estimate_cost(k); } diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 10abd7ff90..5bdd8645a5 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -240,7 +240,7 @@ MachineMappingResult auto get_mapping_result = [&](MachineView const &machine_view) { OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key(leaf, machine_view); - float cost = context.cost_estimator.estimate_cost(mapped); + float cost = context.cost_estimator.estimate_cost(mapped).runtime; return make_singleton_machine_mapping_result(cost, machine_view); }; diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc new file mode 100644 index 0000000000..b67083e8cd --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -0,0 +1,264 @@ +#include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" +#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" +#include "compiler/machine_mapping/get_machine_resource_splits.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" +#include "compiler/machine_mapping/transitive_reduced_pcg.h" +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/machine_specification.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "utils/containers/contains.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/get_all_assignments.h" +#include "utils/containers/unordered_set_of.h" +#include "utils/exception.h" +#include "utils/overload.h" + +namespace FlexFlow { + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints) { + + MachineMappingState state = MachineMappingState{ + problem_tree, + resources, + constraints, + }; + + { + std::optional<MachineMappingWithMemoryResult> cached_result = + machine_mapping_with_memory_cache_load(result_cache, state); + if (cached_result) { + return cached_result.value(); + } + } + + MachineMappingWithMemoryResult result = + problem_tree.visit<MachineMappingWithMemoryResult>(overload{ + [&](MMProblemTreeSeriesSplit const &series_split) { + return get_optimal_machine_mapping_with_memory( + result_cache, + context, + series_split, + resources, + constraints, + /*parallel_split_transformation=*/std::nullopt); + }, + [&](auto const 
&decomp_tree_node) { + return get_optimal_machine_mapping_with_memory(result_cache, + context, + decomp_tree_node, + resources, + constraints); + }, + }); + + machine_mapping_with_memory_cache_save(result_cache, state, result); + return result; +} + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + std::optional<ParallelSplitTransformation> const + &parallel_split_transformation) { + + auto get_boundary_machine_view_assignments = + [&](std::unordered_set<BinaryTreePath> const &boundary_layers) + -> std::unordered_set<ParallelLayerGuidObliviousMachineMapping> { + std::unordered_map<BinaryTreePath, std::unordered_set<MachineView>> + allowed = generate_map( + boundary_layers, + [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> { + UnmappedOpCostEstimateKey leaf = + mm_problem_tree_get_subtree_at_path( + MachineMappingProblemTree{series_split}, l) + .value() + .get<UnmappedOpCostEstimateKey>(); + return context.allowed_machine_views(leaf, resources); + }); + return transform( + get_all_assignments(allowed), + [](std::unordered_map<BinaryTreePath, MachineView> const &m) { + return ParallelLayerGuidObliviousMachineMapping{m}; + }); + }; + + auto eval_pre_boundary_mapping = + [&](ParallelLayerGuidObliviousMachineMapping const + &assigned_pre_machine_views) { + MachineMappingConstraints pre_candidate = with_additional_constraints( + restrict_to_left_child(constraints), assigned_pre_machine_views); + + MachineMappingWithMemoryResult pre_result = + get_optimal_machine_mapping_with_memory( + result_cache, + context, + series_split.get_left_child(), + resources, + pre_candidate); + + return pre_result; + }; + + auto eval_post_boundary_mapping = + [&](ParallelLayerGuidObliviousMachineMapping const + &assigned_post_machine_views) { + MachineMappingConstraints post_candidate = with_additional_constraints( + restrict_to_right_child(constraints), assigned_post_machine_views); + + MachineMappingWithMemoryResult post_result = + get_optimal_machine_mapping_with_memory( + result_cache, + context, + series_split.get_right_child(), + resources, + post_candidate); + + return post_result; + }; + + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); + AbstractedTensorSetMovement tensor_movement = + series_split.tensor_set_movement; + + for (ParallelLayerGuidObliviousMachineMapping const + &assigned_pre_machine_views : + get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) { + + MachineMappingWithMemoryResult pre_result = + eval_pre_boundary_mapping(assigned_pre_machine_views); + + for (ParallelLayerGuidObliviousMachineMapping const + &assigned_post_machine_views : + get_boundary_machine_view_assignments( + get_dst_layers(tensor_movement))) { + + MachineMappingWithMemoryResult post_result = + eval_post_boundary_mapping(assigned_post_machine_views); + + TensorSetMovement comm_across_split = + concretize_abstracted_tensor_set_movement( + tensor_movement, + /*pre_mapping=*/assigned_pre_machine_views, + /*post_mapping=*/assigned_post_machine_views); + float cost_across_split = + context.cost_estimator.estimate_cost(comm_across_split); + + result = minimize_runtime(result, + series_combine(cost_across_split, + pre_result, + post_result, + parallel_split_transformation)); + } + } + + return result; +} + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeParallelSplit const &parallel_split, 
MachineSpecification const &resources, + MachineMappingConstraints const &constraints) { + + MachineMappingProblemTree lhs = parallel_split.get_left_child(); + MachineMappingProblemTree rhs = parallel_split.get_right_child(); + + MachineMappingWithMemoryResult series_result = [&] { + MMProblemTreeSeriesSplit series_split = MMProblemTreeSeriesSplit{ + /*tensor_set_movement=*/empty_abstracted_tensor_set_movement(), + /*left_child=*/lhs, + /*right_child=*/rhs, + }; + + return get_optimal_machine_mapping_with_memory( + result_cache, + context, + series_split, + resources, + constraints, + ParallelSplitTransformation::LthenR); + }(); + + MachineMappingConstraints left_constraints = + restrict_to_left_child(constraints); + MachineMappingConstraints right_constraints = + restrict_to_right_child(constraints); + + auto evaluate_resource_split = + [&](std::pair<MachineSpecification, MachineSpecification> const + &resource_split) { + MachineMappingWithMemoryResult left_result = + get_optimal_machine_mapping_with_memory(result_cache, + context, + lhs, + resource_split.first, + left_constraints); + MachineMappingWithMemoryResult right_result = + get_optimal_machine_mapping_with_memory(result_cache, + context, + rhs, + resource_split.second, + right_constraints); + + return parallel_combine(left_result, right_result); + }; + + std::unordered_set<MachineMappingWithMemoryResult> parallel_results = + transform(get_machine_resource_splits(resources), + evaluate_resource_split); + + return minimize_runtime(series_result, + get_mapping_with_minimal_runtime(parallel_results)); +} + +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, + MachineMappingContext const &context, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resource, + MachineMappingConstraints const &constraints) { + + std::unordered_set<MachineView> candidates = [&] { + std::optional<MachineView> machine_view = require_only_root(constraints); + if (machine_view.has_value()) { + return std::unordered_set<MachineView>{machine_view.value()}; + } else { + return context.allowed_machine_views(leaf, resource); + } + }(); + + auto get_mapping_result = [&](MachineView const &machine_view) { + OpCostEstimateKey mapped = + map_unmapped_op_cost_estimate_key(leaf, machine_view); + OpCostMetrics cost = context.cost_estimator.estimate_cost(mapped); + + return make_singleton_machine_mapping_with_memory_result(cost, + machine_view); + }; + + std::unordered_set<MachineMappingWithMemoryResult> candidate_results = + transform(candidates, get_mapping_result); + + return get_mapping_with_minimal_runtime(candidate_results); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc new file mode 100644 index 0000000000..617ba682be --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc @@ -0,0 +1,32 @@ +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/try_at.h" + +namespace FlexFlow { + +MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache() { + return MachineMappingWithMemoryCache{{}}; +} + +std::optional<MachineMappingWithMemoryResult> + machine_mapping_with_memory_cache_load( + MachineMappingWithMemoryCache const &cache, + MachineMappingState const &k) { + return try_at(cache.raw_map, k); +} + +void machine_mapping_with_memory_cache_save( + MachineMappingWithMemoryCache &cache, 
MachineMappingState const &k, + MachineMappingWithMemoryResult const &v) { + if (contains_key(cache.raw_map, k)) { + throw mk_runtime_error(fmt::format( + "machine_mapping_with_memory_cache_save expected key to not already " + "exist, but received existing key {}", + k)); + } + + cache.raw_map.emplace(k, v); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc new file mode 100644 index 0000000000..a6c2d1ed04 --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -0,0 +1,142 @@ +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" +#include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h" +#include "utils/containers/set_union.h" +#include "utils/full_binary_tree/binary_tree_path.h" + +namespace FlexFlow { + +MachineMappingWithMemoryResult empty_machine_mapping_with_memory_result() { + return MachineMappingWithMemoryResult{ + {}, + }; +} + +MachineMappingWithMemoryResult get_mapping_with_minimal_runtime( + std::unordered_set<MachineMappingWithMemoryResult> const &candidates) { + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); + + for (MachineMappingWithMemoryResult const &candidate : candidates) { + result = minimize_runtime(result, candidate); + } + + return result; +} + +MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( + MachineMappingWithMemoryResult const &result) { + std::unordered_set<MachineMappingForSingleLayer> pareto_optimal_mappings; + for (MachineMappingForSingleLayer const &mapping : result.machine_mappings) { + bool is_pareto_optimal = true; + for (MachineMappingForSingleLayer const &other_mapping : + result.machine_mappings) { + // a mapping is dominated if some other mapping is no worse in both + // runtime and memory and differs in cost (comparing costs rather than + // whole mappings keeps cost-equal alternatives from eliminating each + // other) + if (mapping.cost.runtime >= other_mapping.cost.runtime && + mapping.cost.memory >= other_mapping.cost.memory && + mapping.cost != other_mapping.cost) { + is_pareto_optimal = false; + break; + } + } + if (is_pareto_optimal) { + pareto_optimal_mappings.insert(mapping); + } + } + return MachineMappingWithMemoryResult{std::move(pareto_optimal_mappings)}; +} + +MachineMappingWithMemoryResult + series_combine(float comm_cost, + MachineMappingWithMemoryResult const &pre_result, + MachineMappingWithMemoryResult const &post_result, + std::optional<ParallelSplitTransformation> const + &parallel_split_transformation) { + auto combine_machine_mapping = + [&](MachineMappingForSingleLayer const &pre_mm, + MachineMappingForSingleLayer const &post_mm) { + OpCostMetrics cost = OpCostMetrics{ + pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime, + pre_mm.cost.memory + post_mm.cost.memory, + }; + + ParallelLayerGuidObliviousMachineMapping mapping = [&] { + if (parallel_split_transformation.has_value() && + parallel_split_transformation.value() == + ParallelSplitTransformation::RthenL) { + return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping, + /*rhs=*/pre_mm.machine_mapping); + } else { + return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping, + /*rhs=*/post_mm.machine_mapping); + } + }(); + + return MachineMappingForSingleLayer{cost, mapping}; + }; + + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); + for (MachineMappingForSingleLayer const &pre_mm : + pre_result.machine_mappings) { + for (MachineMappingForSingleLayer const &post_mm : + post_result.machine_mappings) { 
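+ // every (pre, post) pairing of the two sides' Pareto frontiers is a + // candidate for the combined frontier; dominated candidates are pruned + // by remove_non_pareto_optimal_machine_mapping_result below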
result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm)); + } + } + + return remove_non_pareto_optimal_machine_mapping_result(result); +} + +MachineMappingWithMemoryResult + parallel_combine(MachineMappingWithMemoryResult const &lhs_result, + MachineMappingWithMemoryResult const &rhs_result) { + auto combine_machine_mapping = + [&](MachineMappingForSingleLayer const &lhs_mm, + MachineMappingForSingleLayer const &rhs_mm) { + OpCostMetrics cost = OpCostMetrics{ + std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime), + std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), + }; + + ParallelLayerGuidObliviousMachineMapping mapping = + binary_combine_mappings(lhs_mm.machine_mapping, + rhs_mm.machine_mapping); + + return MachineMappingForSingleLayer{cost, mapping}; + }; + + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); + for (MachineMappingForSingleLayer const &lhs_mm : + lhs_result.machine_mappings) { + for (MachineMappingForSingleLayer const &rhs_mm : + rhs_result.machine_mappings) { + result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm)); + } + } + + return remove_non_pareto_optimal_machine_mapping_result(result); +} + +MachineMappingWithMemoryResult + minimize_runtime(MachineMappingWithMemoryResult const &m1, + MachineMappingWithMemoryResult const &m2) { + MachineMappingWithMemoryResult result = MachineMappingWithMemoryResult{ + set_union(m1.machine_mappings, m2.machine_mappings), + }; + return remove_non_pareto_optimal_machine_mapping_result(result); +} + +MachineMappingWithMemoryResult + make_singleton_machine_mapping_with_memory_result( + OpCostMetrics cost, MachineView const &machine_view) { + return MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + cost, + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), machine_view}, + }}, + }, + }}; +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc index 9ee596af3e..0431104878 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc @@ -5,13 +5,15 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( - std::function<float(OpCostEstimateKey const &)> const &get_operator_cost, + std::function<OpCostMetrics(OpCostEstimateKey const &)> const + &get_operator_cost, std::function<float(TensorSetMovement const &)> const &get_communication_cost) : get_operator_cost(get_operator_cost), get_communication_cost(get_communication_cost) {} -float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +OpCostMetrics + TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->get_operator_cost(k); } @@ -20,16 +22,16 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { } CostEstimator make_fake_cost_estimator( - std::function<float(OpCostEstimateKey const &)> const &get_operator_cost, + std::function<OpCostMetrics(OpCostEstimateKey const &)> const + &get_operator_cost, std::function<float(TensorSetMovement const &)> const &get_communication_cost) { - return CostEstimator::create<TestCostEstimator>(get_operator_cost, get_communication_cost); } CostEstimator make_fake_cost_estimator( - std::unordered_map<OpCostEstimateKey, float> const &op_cost_map, + std::unordered_map<OpCostEstimateKey, OpCostMetrics> const &op_cost_map, std::unordered_map<TensorSetMovement, float> const &comm_cost_map) { return make_fake_cost_estimator( [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h index 
7c1d06207a..16ea3a85bc 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h @@ -11,7 +11,7 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { - std::function<float(OpCostEstimateKey const &)> get_operator_cost; + std::function<OpCostMetrics(OpCostEstimateKey const &)> get_operator_cost; std::function<float(TensorSetMovement const &)> get_communication_cost; TestCostEstimator() = delete; @@ -19,18 +19,19 @@ struct TestCostEstimator : public ICostEstimator { decltype(get_communication_cost) const &get_communication_cost); - float estimate_cost(OpCostEstimateKey const &) const override; + OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override; float estimate_cost(TensorSetMovement const &) const override; }; CostEstimator make_fake_cost_estimator( - std::function<float(OpCostEstimateKey const &)> const &get_operator_cost, + std::function<OpCostMetrics(OpCostEstimateKey const &)> const + &get_operator_cost, std::function<float(TensorSetMovement const &)> const &get_communication_cost); CostEstimator make_fake_cost_estimator( - std::unordered_map<OpCostEstimateKey, float> const &op_cost_map, + std::unordered_map<OpCostEstimateKey, OpCostMetrics> const &op_cost_map, std::unordered_map<TensorSetMovement, float> const &comm_cost_map); } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index a0d06fe930..ac180cd079 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -144,13 +144,19 @@ TEST_SUITE(FF_TEST_SUITE) { {binary_tree_root_path(), mv2}, }}; + auto map1 = std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), + OpCostMetrics{/*runtime=*/1.0, /*memory=*/nonnegative_int{0}}}, + {map_unmapped_op_cost_estimate_key(k2, mv1), + OpCostMetrics{/*runtime=*/2.0, /*memory=*/nonnegative_int{0}}}, + {map_unmapped_op_cost_estimate_key(k1, mv2), + OpCostMetrics{/*runtime=*/1.5, /*memory=*/nonnegative_int{0}}}, + {map_unmapped_op_cost_estimate_key(k2, mv2), + OpCostMetrics{/*runtime=*/2.5, /*memory=*/nonnegative_int{0}}}, + }}; + CostEstimator cost_estimator = make_fake_cost_estimator( - std::unordered_map<OpCostEstimateKey, float>{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0}, - {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0}, - {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5}, - {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5}, - }}, + map1, std::unordered_map<TensorSetMovement, float>{{ {TensorSetMovement{{}}, 0.0}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc new file mode 100644 index 0000000000..9706f1c75f --- /dev/null +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -0,0 +1,294 @@ +#include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" +#include "../cost_estimator_for_test.h" +#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" +#include "pcg/machine_view.h" 
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "utils/containers/get_only.h" +#include "utils/full_binary_tree/binary_tree_path.h" +#include <doctest/doctest.h> + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_optimal_machine_mapping_with_memory") { + auto make_leaf = [](UnmappedOpCostEstimateKey const &k) { + return MachineMappingProblemTree{k}; + }; + + auto make_series_split = + [](AbstractedTensorSetMovement const &tensor_set_movement, + MachineMappingProblemTree const &lhs, + MachineMappingProblemTree const &rhs) { + return MachineMappingProblemTree{ + MMProblemTreeSeriesSplit{ + /*tensor_set_movement=*/tensor_set_movement, + /*left_child=*/lhs, + /*right_child=*/rhs, + }, + }; + }; + + auto make_parallel_split = [](MachineMappingProblemTree const &lhs, + MachineMappingProblemTree const &rhs) { + return MachineMappingProblemTree{ + MMProblemTreeParallelSplit{ + /*left_child=*/lhs, + /*right_child=*/rhs, + }, + }; + }; + + MachineView mv1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView mv2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/2, + /*num_cpus_per_node=*/1, + /*num_gpus_per_node=*/1, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + MachineSpecification split_machine_spec = MachineSpecification{ + /*num_nodes=*/1, + /*num_cpus_per_node=*/1, + /*num_gpus_per_node=*/1, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &, + MachineSpecification const &resources) { + if (resources == full_machine_spec) { + return std::unordered_set<MachineView>{mv1, mv2}; + } else { + return std::unordered_set<MachineView>{mv2}; + } + }; + + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; + + UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{ + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, + }}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; + + ParallelTensorShape tensor_shape1 = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered<ShardParallelDim>{}, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + + AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{ + AbstractedSingleTensorMovement{ + /*parallel_tensor_shape=*/tensor_shape1, + /*src_machine_views=*/{}, + /*dst_machine_views=*/{}, + }, + }}; + + ParallelLayerGuidObliviousMachineMapping mm1 = + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv1}, + }}; + ParallelLayerGuidObliviousMachineMapping mm2 = + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv2}, + }}; + + CostEstimator cost_estimator = make_fake_cost_estimator( + std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), + OpCostMetrics{1.0, 
nonnegative_int{2}}}, + {map_unmapped_op_cost_estimate_key(k2, mv1), + OpCostMetrics{2.0, nonnegative_int{3}}}, + {map_unmapped_op_cost_estimate_key(k1, mv2), + OpCostMetrics{1.5, nonnegative_int{1}}}, + {map_unmapped_op_cost_estimate_key(k2, mv2), + OpCostMetrics{2.5, nonnegative_int{2}}}, + }}, + std::unordered_map<TensorSetMovement, float>{{ + {TensorSetMovement{{}}, 0.0}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), + 0.1}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), + 0.2}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), + 0.3}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), + 0.4}, + }}); + + MachineMappingContext context = MachineMappingContext{ + cost_estimator, + allowed_machine_views1, + }; + + MachineMappingWithMemoryCache cache = + empty_machine_mapping_with_memory_cache(); + + SUBCASE("single layer") { + MachineMappingProblemTree problem_tree = make_leaf(k1); + + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers( + get_all_leaf_paths(problem_tree)); + + MachineMappingWithMemoryResult result = + get_optimal_machine_mapping_with_memory( + cache, context, problem_tree, full_machine_spec, constraints); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + OpCostMetrics{1.0, nonnegative_int{2}}, + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv1}, + }}, + }, + MachineMappingForSingleLayer{ + OpCostMetrics{1.5, nonnegative_int{1}}, + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv2}, + }}, + }, + }}; + + CHECK(result == correct); + } + + SUBCASE("pair of layers in sequence") { + MachineMappingProblemTree problem_tree = + make_series_split(movement1, make_leaf(k1), make_leaf(k2)); + + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers( + get_all_leaf_paths(problem_tree)); + + MachineMappingWithMemoryResult result = + get_optimal_machine_mapping_with_memory( + cache, context, problem_tree, full_machine_spec, constraints); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + OpCostMetrics{ + /*runtime=*/1.0 + 2.0 + 0.1, + /*memory=*/nonnegative_int{2 + 3}, + }, + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + mv1, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + mv1, + }, + }}, + }, + MachineMappingForSingleLayer{ + OpCostMetrics{1.5 + 2.5 + 0.1, nonnegative_int{1 + 2}}, + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + mv2, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + mv2, + }, + }}, + }, + }}; + + CHECK(result == correct); + } + + SUBCASE("pair of layers in parallel") { + MachineMappingProblemTree problem_tree = + make_parallel_split(make_leaf(k1), make_leaf(k2)); + + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers( + get_all_leaf_paths(problem_tree)); + + MachineMappingWithMemoryResult result = + get_optimal_machine_mapping_with_memory( + cache, context, problem_tree, full_machine_spec, constraints); + MachineMappingWithMemoryResult correct = + MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{ + OpCostMetrics{2.5, nonnegative_int{2}}, + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + mv2, + }, + { + BinaryTreePath{{ 
+                          BinaryTreePathEntry::RIGHT_CHILD,
+                      }},
+                      mv2,
+                  },
+              }},
+
+          }}};
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
new file mode 100644
index 0000000000..ecfb7cfeb3
--- /dev/null
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -0,0 +1,593 @@
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
+#include "pcg/machine_view.h"
+#include <doctest/doctest.h>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("remove_non_pareto_optimal_machine_mapping_result") {
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{4},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    OpCostMetrics cost1 = OpCostMetrics{
+        /*runtime=*/2.0,
+        /*memory=*/nonnegative_int{2},
+    };
+    OpCostMetrics cost2 = OpCostMetrics{
+        /*runtime=*/4.0,
+        /*memory=*/nonnegative_int{1},
+    };
+    OpCostMetrics cost3 = OpCostMetrics{
+        /*runtime=*/2.0,
+        /*memory=*/nonnegative_int{3},
+    };
+
+    MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{
+        cost1,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_0,
+                },
+            },
+        },
+    };
+
+    MachineMappingForSingleLayer mm2 = MachineMappingForSingleLayer{
+        cost2,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_1,
+                },
+            },
+        },
+    };
+
+    MachineMappingForSingleLayer mm3 = MachineMappingForSingleLayer{
+        cost3,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_2,
+                },
+            },
+        },
+    };
+
+    SUBCASE("empty") {
+      MachineMappingWithMemoryResult before_remove =
+          empty_machine_mapping_with_memory_result();
+      MachineMappingWithMemoryResult result =
+          remove_non_pareto_optimal_machine_mapping_result(before_remove);
+      MachineMappingWithMemoryResult correct =
+          empty_machine_mapping_with_memory_result();
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("all solutions are pareto-optimal") {
+      MachineMappingWithMemoryResult before_remove =
+          MachineMappingWithMemoryResult{
+              {
+                  mm1,
+                  mm2,
+              },
+          };
+      MachineMappingWithMemoryResult result =
+          remove_non_pareto_optimal_machine_mapping_result(before_remove);
+      MachineMappingWithMemoryResult correct = before_remove;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("there exists a non-pareto-optimal solution") {
+      MachineMappingWithMemoryResult before_remove =
+          MachineMappingWithMemoryResult{
+              {
+                  mm1,
+                  mm2,
+                  mm3,
+              },
+          };
+      MachineMappingWithMemoryResult result =
+          remove_non_pareto_optimal_machine_mapping_result(before_remove);
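+      // cost3 (runtime 2.0, memory 3) is dominated by cost1 (runtime 2.0,
+      // memory 2), so mm3 is filtered out and only the pareto-optimal mm1
+      // and mm2 remain.
+      MachineMappingWithMemoryResult correct =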
MachineMappingWithMemoryResult{ + { + mm1, + mm2, + }, + }; + + CHECK(result == correct); + } + } + + TEST_CASE("series_combine(float, MachineMappingWithMemoryResult const &, " + "MachineMappingWithMemoryResult const &, " + "std::optional const&)") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + OpCostMetrics pre_cost = OpCostMetrics{ + /*runtime=*/2.0, + /*memory=*/nonnegative_int{2}, + }; + MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + pre_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{ + {BinaryTreePathEntry::LEFT_CHILD}, + }, + machine_view_0, + }, + { + BinaryTreePath{ + {BinaryTreePathEntry::RIGHT_CHILD}, + }, + machine_view_1, + }, + }, + }, + }, + }}; + + OpCostMetrics post_cost = OpCostMetrics{ + /*runtime=*/4.0, + /*memory=*/nonnegative_int{1}, + }; + + MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + post_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }, + }}; + + MachineMappingWithMemoryResult empty = + empty_machine_mapping_with_memory_result(); + + float comm_cost = 3.0; + + SUBCASE("pre is empty") { + MachineMappingWithMemoryResult result = series_combine( + comm_cost, empty, post, ParallelSplitTransformation::LthenR); + MachineMappingWithMemoryResult correct = empty; + + CHECK(result == correct); + } + + SUBCASE("post is empty") { + MachineMappingWithMemoryResult result = series_combine( + comm_cost, pre, empty, ParallelSplitTransformation::LthenR); + MachineMappingWithMemoryResult correct = empty; + + CHECK(result == correct); + } + + SUBCASE("both are nonempty") { + MachineMappingWithMemoryResult no_parallel_split_transform = + MachineMappingWithMemoryResult{ + { + MachineMappingForSingleLayer{ + /*cost=*/OpCostMetrics{ + /*runtime=*/pre_cost.runtime + comm_cost + + post_cost.runtime, + /*memory=*/pre_cost.memory + post_cost.memory, + }, + /*machine_mapping=*/ + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::LEFT_CHILD, + }}, + machine_view_0, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::RIGHT_CHILD, + }}, + machine_view_1, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + machine_view_1, + }, + }}, + }, + }, + }; + + SUBCASE("parallel_split_transformation = std::nullopt") { + MachineMappingWithMemoryResult result = + series_combine(comm_cost, pre, post, std::nullopt); + MachineMappingWithMemoryResult correct = no_parallel_split_transform; + + CHECK(result == correct); + } + + SUBCASE("parallel_split_transformation = LthenR") { + MachineMappingWithMemoryResult result = series_combine( + comm_cost, pre, post, ParallelSplitTransformation::LthenR); + MachineMappingWithMemoryResult correct = no_parallel_split_transform; + + CHECK(result == correct); + } + + SUBCASE("parallel_split_transformation = RthenL") { + 
MachineMappingWithMemoryResult result = series_combine( + comm_cost, pre, post, ParallelSplitTransformation::RthenL); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ + { + MachineMappingForSingleLayer{ + /*cost=*/OpCostMetrics{ + /*runtime=*/pre_cost.runtime + comm_cost + + post_cost.runtime, + /*memory=*/pre_cost.memory + post_cost.memory, + }, + /*machine_mapping=*/ + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + BinaryTreePathEntry::LEFT_CHILD, + }}, + machine_view_0, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + BinaryTreePathEntry::RIGHT_CHILD, + }}, + machine_view_1, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + machine_view_1, + }, + }}, + }, + }, + }; + + CHECK(result == correct); + } + } + } + + TEST_CASE("parallel_combine(float, MachineMappingWithMemoryResult const &, " + "MachineMappingWithMemoryResult const &, " + "std::optional const&)") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + OpCostMetrics lhs_cost = OpCostMetrics{ + /*runtime=*/2.0, + /*memory=*/nonnegative_int{2}, + }; + MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + lhs_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{ + {BinaryTreePathEntry::LEFT_CHILD}, + }, + machine_view_0, + }, + { + BinaryTreePath{ + {BinaryTreePathEntry::RIGHT_CHILD}, + }, + machine_view_1, + }, + }, + }, + }, + }}; + + OpCostMetrics rhs_cost = OpCostMetrics{ + /*runtime=*/4.0, + /*memory=*/nonnegative_int{1}, + }; + MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + rhs_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }, + }}; + + MachineMappingWithMemoryResult empty = + empty_machine_mapping_with_memory_result(); + + SUBCASE("lhs is empty") { + MachineMappingWithMemoryResult result = parallel_combine(empty, rhs); + MachineMappingWithMemoryResult correct = empty; + + CHECK(result == correct); + } + + SUBCASE("rhs is empty") { + MachineMappingWithMemoryResult result = parallel_combine(lhs, empty); + MachineMappingWithMemoryResult correct = empty; + + CHECK(result == correct); + } + + SUBCASE("both are nonempty") { + MachineMappingWithMemoryResult result = parallel_combine(lhs, rhs); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ + MachineMappingForSingleLayer{ + /*cost=*/OpCostMetrics{ + /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime), + /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory), + }, + /*machine_mapping=*/ + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::LEFT_CHILD}}, + machine_view_0, + }, + { + BinaryTreePath{{BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::RIGHT_CHILD}}, + machine_view_1, + }, + { + BinaryTreePath{{BinaryTreePathEntry::RIGHT_CHILD}}, + machine_view_1, + 
}, + }, + }, + }, + }}; + + CHECK(result == correct); + } + } + + TEST_CASE("minimize_runtime(memory)") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{4}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + OpCostMetrics cost1 = OpCostMetrics{ + /*runtime=*/2.0, + /*memory=*/nonnegative_int{2}, + }; + OpCostMetrics cost2 = OpCostMetrics{ + /*runtime=*/4.0, + /*memory=*/nonnegative_int{1}, + }; + OpCostMetrics cost3 = OpCostMetrics{ + /*runtime=*/2.0, + /*memory=*/nonnegative_int{3}, + }; + + MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ + cost1, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_0, + }, + }, + }, + }; + + MachineMappingForSingleLayer mm2 = MachineMappingForSingleLayer{ + cost2, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }; + + MachineMappingForSingleLayer mm3 = MachineMappingForSingleLayer{ + cost3, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_2, + }, + }, + }, + }; + + MachineMappingWithMemoryResult result1 = MachineMappingWithMemoryResult{ + { + mm1, + mm2, + }, + }; + + MachineMappingWithMemoryResult result2 = MachineMappingWithMemoryResult{ + { + mm2, + mm3, + }, + }; + + MachineMappingWithMemoryResult result = minimize_runtime(result1, result2); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ + { + mm1, + mm2, + }, + }; + + CHECK(result == correct); + } +} diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 96a3b3b281..326c6922f9 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -3,7 +3,7 @@ #include "legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" -#include "utils/stack_vector.h" +#include "utils/stack_vector/stack_vector.h" #include "utils/visitable.h" #include #include diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 9ef47d40ae..142dcbcb2c 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -7,7 +7,7 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { } legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { - return legion_dim_t(num_dimensions - ff_dim.value - 1); + return legion_dim_t(num_dimensions - ff_dim.value.get_value() - 1); } } // namespace FlexFlow diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 007740b510..00da2d0d70 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -15,3 +15,10 @@ ff_add_test_executable( cudart cublas ) + +set(FF_TEST_EXEC_NAME "kernels-tests") +add_custom_command( + TARGET ${FF_TEST_EXEC_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} 
-DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake
+  DEPENDS ${FF_TEST_EXEC_NAME}
+)
diff --git a/lib/kernels/test/modify_test_commands.cmake b/lib/kernels/test/modify_test_commands.cmake
new file mode 100644
index 0000000000..6494ae2d78
--- /dev/null
+++ b/lib/kernels/test/modify_test_commands.cmake
@@ -0,0 +1,21 @@
+# modify_test_commands.cmake
+
+file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake")
+
+foreach(ctest_tests_file IN LISTS ctest_tests_files)
+  file(READ "${ctest_tests_file}" content)
+
+  # add nix run prefix
+  string(REGEX REPLACE
+    "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)"
+    "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])"
+    content "${content}")
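+  # e.g. (hypothetical test entry) a generated line such as
+  #   add_test( [==[some-case]==] /build/bin/kernels-tests [==[--some-flag]==])
+  # is rewritten so the test binary runs under nixGL:
+  #   add_test( [==[some-case]==] nixGL -- /build/bin/kernels-tests [==[--some-flag]==])
+
+  # add environment
+  # string(REGEX REPLACE
+  #   "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)"
+  #   "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")"
+  #   content "${content}")
+
+  file(WRITE "${ctest_tests_file}" "${content}")
+endforeach()
diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc
index bf2a521b4e..2212e384fa 100644
--- a/lib/kernels/test/src/test_concat_kernel.cc
+++ b/lib/kernels/test/src/test_concat_kernel.cc
@@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Test concat kernel forward and backward") {
     size_t num_inputs = 3;
     size_t size_per_input = 100;
-    ff_dim_t concat_axis = ff_dim_t(0);
+    ff_dim_t concat_axis = ff_dim_t{nonnegative_int{0}};
 
     ManagedPerDeviceFFHandle managed_handle{};
     ManagedFFStream managed_stream{};
@@ -21,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("forward_kernel") {
      std::vector<GenericTensorAccessorR> input_accessors =
-          repeat(num_inputs, [&]() {
+          repeat<GenericTensorAccessorR>(num_inputs, [&]() {
            return read_only_accessor_from_write_accessor(
                create_random_filled_accessor_w(input_shape, allocator));
          });
@@ -44,9 +44,10 @@ TEST_SUITE(FF_TEST_SUITE) {
      GenericTensorAccessorR output_grad_accessor =
          read_only_accessor_from_write_accessor(
              create_random_filled_accessor_w(output_shape, allocator));
-      std::vector<GenericTensorAccessorW> input_grad_accessors = repeat(
-          num_inputs, [&]() { return allocator.allocate_tensor(input_shape); });
-
+      std::vector<GenericTensorAccessorW> input_grad_accessors =
+          repeat<GenericTensorAccessorW>(num_inputs, [&]() {
+            return allocator.allocate_tensor(input_shape);
+          });
      Kernels::Concat::backward_kernel(managed_stream.raw_stream(),
                                       output_grad_accessor,
                                       input_grad_accessors,
diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc
index 81f3c7183a..e29143e251 100644
--- a/lib/kernels/test/src/test_dropout.cc
+++ b/lib/kernels/test/src/test_dropout.cc
@@ -25,7 +25,8 @@ TEST_SUITE(FF_TEST_SUITE) {
        managed_handle.raw_handle(), dropout_rate, seed, shape, allocator);
 
    auto get_zero_count = [](std::vector<float> const &data) {
-      return count(data, [](float x) { return x == 0.0f; });
+      return std::count_if(
+          data.begin(), data.end(), [](float x) { return x == 0.0f; });
    };
 
    SUBCASE("forward_kernel") {
diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc
index 7cc2b28c9e..f2346c9244 100644
--- a/lib/kernels/test/src/test_split_kernel.cc
+++ b/lib/kernels/test/src/test_split_kernel.cc
@@ -23,7 +23,8 @@ TEST_SUITE(FF_TEST_SUITE) {
    GenericTensorAccessorW input_accessor =
        create_random_filled_accessor_w(input_shape, allocator);
 
-    std::vector<float *> output_ptrs = repeat(num_outputs, [&]() {
+    std::vector<float *> output_ptrs(num_outputs);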
+    generate_n(output_ptrs.begin(), num_outputs, [&]() {
      GenericTensorAccessorW output_accessor =
          allocator.allocate_tensor(output_shape);
      return output_accessor.get_float_ptr();
diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc
index 2fc186a257..2904fa01ae 100644
--- a/lib/kernels/test/src/test_transpose_kernel.cc
+++ b/lib/kernels/test/src/test_transpose_kernel.cc
@@ -7,7 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) {
  TEST_CASE("Test Transpose Kernel Operations") {
    std::size_t num_dims = 2;
 
-    std::vector<ff_dim_t> perm = {ff_dim_t(0), ff_dim_t(1)};
+    std::vector<ff_dim_t> perm = {ff_dim_t{nonnegative_int{0}},
+                                  ff_dim_t{nonnegative_int{1}}};
 
    ManagedPerDeviceFFHandle managed_handle{};
    ManagedFFStream managed_stream{};
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index abce3fd444..21d4923881 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -5,7 +5,13 @@
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
+#include <doctest/doctest.h>
 #include <random>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace FlexFlow;
 
 GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator,
@@ -42,7 +48,33 @@ std::vector<T> load_data_to_host_from_device(GenericTensorAccessorR accessor) {
 
 template <typename T>
 bool contains_non_zero(std::vector<T> &data) {
-  return !all_of(data, [](T const &val) { return val == 0; });
+  return !all_of(
+      data.begin(), data.end(), [](T const &val) { return val == 0; });
+}
+
+template <typename T, typename Func>
+std::vector<T> repeat(std::size_t n, Func &&func) {
+  std::vector<T> result;
+  // result.reserve(n); // Sometimes we don't have default constructor for T
+  for (std::size_t i = 0; i < n; ++i) {
+    result.push_back(func());
+  }
+  return result;
 }
 
+// Specialize doctest's StringMaker for std::vector
+template <>
+struct doctest::StringMaker<std::vector<float>> {
+  static doctest::String convert(std::vector<float> const &vec) {
+    std::ostringstream oss;
+    for (size_t i = 0; i < vec.size(); ++i) {
+      oss << vec[i];
+      if (i != vec.size() - 1) {
+        oss << ", ";
+      }
+    }
+    return doctest::String(("[" + oss.str() + "]").c_str());
+  }
+};
+
 #endif
diff --git a/lib/local-execution/include/local-execution/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h
index 2f2ed50d41..3786383865 100644
--- a/lib/local-execution/include/local-execution/legion_tensor_shape.h
+++ b/lib/local-execution/include/local-execution/legion_tensor_shape.h
@@ -3,9 +3,9 @@
 #include "kernels/legion_dim.h"
 #include "op-attrs/datatype.h"
-#include "op-attrs/ff_dim.h"
+#include "op-attrs/ff_dim_t.h"
 #include "op-attrs/tensor_shape.dtg.h"
-#include "utils/stack_vector.h"
+#include "utils/stack_vector/stack_vector.h"
 #include "utils/visitable.h"
 #include 
diff --git a/lib/local-execution/src/legion_tensor_shape.cc b/lib/local-execution/src/legion_tensor_shape.cc
index bce29fafeb..b227accc2e 100644
--- a/lib/local-execution/src/legion_tensor_shape.cc
+++ b/lib/local-execution/src/legion_tensor_shape.cc
@@ -1,14 +1,15 @@
 #include "local-execution/legion_tensor_shape.h"
+#include "kernels/legion_dim.h"
 #include "op-attrs/tensor_shape.h"
 
 namespace FlexFlow {
 
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, size_t num_dims) {
-  return legion_dim_t(num_dims - ff_dim.value - 1);
+  return legion_dim_t(num_dims - ff_dim.value.get_value() - 1);
 }
 
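+// ff_dim_t counts dimensions from the outermost dimension while
+// legion_dim_t counts from the innermost, so the conversion mirrors the
+// index across the total number of dimensions.
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
                                     TensorShape const &shape) {
-  return legion_dim_t(num_dims(shape) -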
ff_dim.value - 1); + return legion_dim_from_ff_dim(ff_dim, num_dims(shape)); } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..3e0b4672ab 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,7 +1,7 @@ #include "linear.h" #include "kernels/linear_kernels.h" #include "local-execution/task_argument_accessor.h" -#include "op-attrs/ff_dim.h" +#include "op-attrs/ff_dim_t.h" #include "op-attrs/get_output_shapes.h" #include "utils/exception.h" #include "utils/hash-utils.h" @@ -66,8 +66,8 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - int out_dim = output.shape.at(ff_dim_t{0}); - int batch_size = output.shape.at(ff_dim_t{1}); + int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}); + int batch_size = output.shape.at(ff_dim_t{nonnegative_int{1}}); float *one_ptr; @@ -96,8 +96,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - int in_dim = input.shape.at(ff_dim_t{0}) + 1; - int out_dim = output.shape.at(ff_dim_t{0}) + 1; + int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; + int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; int batch_size = output.shape.get_volume() / out_dim; float const *bias_ptr = NULL; @@ -140,8 +140,8 @@ static std::optional bias_ptr = bias.get_float_ptr(); } - int in_dim = input.shape.at(ff_dim_t{0}) + 1; - int out_dim = output.shape.at(ff_dim_t{0}) + 1; + int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; + int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; int batch_size = output.shape.get_volume() / out_dim; return profile(backward_kernel, diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 33d62b713c..3ab33a2ad6 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -30,14 +30,14 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - int input_w = input.shape.at(ff_dim_t(0)) + 1; - int input_h = input.shape.at(ff_dim_t(1)) + 1; - int input_c = input.shape.at(ff_dim_t(2)) + 1; - int input_n = input.shape.at(ff_dim_t(3)) + 1; - int output_w = output.shape.at(ff_dim_t(0)) + 1; - int output_h = output.shape.at(ff_dim_t(1)) + 1; - int output_c = output.shape.at(ff_dim_t(2)) + 1; - int output_n = output.shape.at(ff_dim_t(3)) + 1; + int input_w = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; + int input_h = input.shape.at(ff_dim_t{nonnegative_int{1}}) + 1; + int input_c = input.shape.at(ff_dim_t{nonnegative_int{2}}) + 1; + int input_n = input.shape.at(ff_dim_t{nonnegative_int{3}}) + 1; + int output_w = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; + int output_h = output.shape.at(ff_dim_t{nonnegative_int{1}}) + 1; + int output_c = output.shape.at(ff_dim_t{nonnegative_int{2}}) + 1; + int output_n = output.shape.at(ff_dim_t{nonnegative_int{3}}) + 1; printf("init pool (input): n(%d) c(%d) h(%d) " "w(%d)\n", diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 366a579bea..8ac4c045c7 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -53,11 +53,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) 
{ coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t(i)); + in_blk_size *= output.shape.at(ff_dim_t{nonnegative_int{i}}); } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t(i)); + reverse_dim_size = output.shape.at(ff_dim_t{nonnegative_int{i}}); } else { - num_out_blks *= output.shape.at(ff_dim_t(i)); + num_out_blks *= output.shape.at(ff_dim_t{nonnegative_int{i}}); } } @@ -79,15 +79,15 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.get_dim() - attrs.axis.value - 1; + int axis = input_grad.shape.get_dim() - attrs.axis.value.get_value() - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t(i)); + in_blk_size *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); + reverse_dim_size = input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); } else { - num_out_blks *= input_grad.shape.at(ff_dim_t(i)); + num_out_blks *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); } } diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 4c7979ae9b..8d412c739b 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -64,8 +64,13 @@ static DeviceSpecificDeviceStates int output_c = output.shape.at(legion_dim_t(2)); int output_n = output.shape.at(legion_dim_t(3)); - SoftmaxPerDeviceState per_device_state = init_kernel( - handle, attrs.dim.value, output_n, output_c, output_h, output_w); + SoftmaxPerDeviceState per_device_state = + init_kernel(handle, + attrs.dim.value.get_value(), + output_n, + output_c, + output_h, + output_w); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index 9f039d84f8..c289bca205 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -47,11 +47,11 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { void calc_block_size(coord_t &num_blocks, coord_t &block_size, ArrayShape const &array_shape, - int axis) { + ff_dim_t axis) { num_blocks = 1; block_size = 1; for (int d = 0; d < array_shape.num_elements(); d++) { - if (d <= axis) { + if (d <= axis.value.get_value()) { block_size *= array_shape.at(legion_dim_t(d)); } else { num_blocks *= array_shape.at(legion_dim_t(d)); @@ -66,12 +66,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis.value); + calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis); for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blocks; calc_block_size( - out_num_blocks, out_block_size[i], output.shape, attrs.axis.value); + out_num_blocks, out_block_size[i], output.shape, attrs.axis); } float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, @@ -94,12 +94,11 @@ static std::optional auto attrs = acc.get_argument(ATTRS); coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; - calc_block_size( - num_blocks, 
in_block_size, input_grad.shape, attrs.axis.value);
+  calc_block_size(num_blocks, in_block_size, input_grad.shape, attrs.axis);
 
   for (int i = 0; i < attrs.splits.size(); i++) {
     coord_t out_num_blocks;
     calc_block_size(
-        out_num_blocks, out_block_size[i], output_grad.shape, attrs.axis.value);
+        out_num_blocks, out_block_size[i], output_grad.shape, attrs.axis);
   }
   float const *output_grad_ptr = output_grad.get_float_ptr();
   return profile(backward_kernel,
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
index 3e4ac15db3..53cf1f20ed 100644
--- a/lib/local-execution/src/ops/transpose.cc
+++ b/lib/local-execution/src/ops/transpose.cc
@@ -17,6 +17,7 @@
 #include "kernels/transpose_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/ops/transpose.h"
+#include "utils/integer_conversions.h"
 
 using namespace FlexFlow::Kernels::Transpose;
 
@@ -39,8 +40,17 @@ OpTaskInvocation init(TransposeAttrs const &attrs) {
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<TransposeAttrs>(ATTRS);
-  std::vector<ff_dim_t> perm = inner_to_outer_idxs(attrs.perm);
-  TransposePerDeviceState per_device_state = init_kernel(perm.size(), perm);
+  int size = int_from_size_t(attrs.perm.size());
+
+  std::vector<ff_dim_t> perm = [&] {
+    std::vector<ff_dim_t> result;
+    for (int i : range(size)) {
+      result.push_back(ff_dim_t{nonnegative_int{size - i - 1}});
+    }
+    return result;
+  }();
+
+  TransposePerDeviceState per_device_state = init_kernel(size, perm);
 
   return DeviceSpecificDeviceStates{
       DeviceSpecific<TransposePerDeviceState>::create(per_device_state)};
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
index 6aa23d40fc..3977f4e0fd 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
+++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
@@ -1,9 +1,11 @@
 #ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_FF_STACK_VECTOR_H
 #define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_FF_STACK_VECTOR_H
 
-#include "op-attrs/ff_dim.dtg.h"
+#include "op-attrs/ff_dim_t.dtg.h"
+#include "op-attrs/relative_ff_dim_t.dtg.h"
+#include "utils/containers/range.h"
 #include "utils/fmt/vector.h"
-#include "utils/stack_vector.h"
+#include "utils/stack_vector/stack_vector.h"
 #include 
 
 namespace FlexFlow {
 
@@ -152,6 +154,169 @@ struct DimOrdered {
   stack_vector<T, MAX_TENSOR_DIM> contents;
 };
 
+template <typename T>
+struct DimOrdered<ff_dim_t, T> {
+  DimOrdered() {}
+
+  DimOrdered(std::initializer_list<T> const &l)
+      : contents(l.begin(), l.end()) {}
+
+  DimOrdered(std::vector<T> const &contents)
+      : contents(contents.begin(), contents.end()) {}
+
+  template <typename It>
+  DimOrdered(It begin, It end) : contents(begin, end) {}
+
+  template <size_t MAXSIZE>
+  DimOrdered(stack_vector<T, MAXSIZE> const &contents)
+      : contents(contents.begin(), contents.end()) {}
+
+  T const &at(ff_dim_t idx) const {
+    int raw = idx.value.get_value();
+    return this->contents.at(raw);
+  }
+
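+  // relative_ff_dim_t indices may be negative; a negative index counts
+  // back from the end of the dimensions (Python-style), so -1 refers to
+  // the last dimension.
+  T const &at(relative_ff_dim_t idx) const {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return this->contents.at(raw);
+  }
+
+  T &at(ff_dim_t idx) {
+    int raw = idx.value.get_value();
+    return this->contents.at(raw);
+  }
+
+  T &at(relative_ff_dim_t idx) {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return this->contents.at(raw);
+  }
+
+  T const &operator[](ff_dim_t idx) const {
+    return this->at(idx);
+  }
+
+  T const &operator[](relative_ff_dim_t idx) const {
+    return this->at(idx);
+  }
+
+  T &operator[](ff_dim_t idx) {
+    return this->at(idx);
+  }
+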
+  T &operator[](relative_ff_dim_t idx) {
+    return this->at(idx);
+  }
+
+  bool idx_is_valid(ff_dim_t const &idx) const {
+    int raw = idx.value.get_value();
+    return raw < this->contents.size();
+  }
+
+  bool idx_is_valid(relative_ff_dim_t const &idx) const {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return (raw >= 0 && raw < this->contents.size());
+  }
+
+  bool operator==(DimOrdered const &other) const {
+    return this->contents == other.contents;
+  }
+
+  bool operator!=(DimOrdered const &other) const {
+    return this->contents != other.contents;
+  }
+
+  bool operator<(DimOrdered const &other) const {
+    return this->contents < other.contents;
+  }
+
+  using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
+  using const_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
+  using reverse_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::reverse_iterator;
+  using const_reverse_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::const_reverse_iterator;
+  using value_type = T;
+  using pointer = value_type *;
+  using const_pointer = value_type const *;
+  using reference = value_type &;
+  using const_reference = value_type const &;
+
+  iterator begin() {
+    return this->contents.begin();
+  }
+
+  const_iterator begin() const {
+    return this->cbegin();
+  }
+
+  const_iterator cbegin() const {
+    return this->contents.cbegin();
+  }
+
+  iterator end() {
+    return this->contents.end();
+  }
+
+  const_iterator end() const {
+    return this->cend();
+  }
+
+  const_iterator cend() const {
+    return this->contents.cend();
+  }
+
+  reverse_iterator rbegin() {
+    return this->contents.rbegin();
+  }
+
+  const_reverse_iterator rbegin() const {
+    return this->crbegin();
+  }
+
+  const_reverse_iterator crbegin() const {
+    return this->contents.crbegin();
+  }
+
+  reverse_iterator rend() {
+    return this->contents.rend();
+  }
+
+  const_reverse_iterator rend() const {
+    return this->crend();
+  }
+
+  const_reverse_iterator crend() const {
+    return this->contents.crend();
+  }
+
+  size_t size() const {
+    return this->contents.size();
+  }
+
+  size_t empty() const {
+    return this->contents.empty();
+  }
+
+  size_t num_dims() const {
+    return this->size();
+  }
+
+  friend struct ::std::hash<DimOrdered>;
+
+private:
+  stack_vector<T, MAX_TENSOR_DIM> contents;
+};
+
 template <typename T>
 using FFOrdered = DimOrdered<ff_dim_t, T>;
 
@@ -166,31 +331,6 @@ std::ostream &operator<<(std::ostream &s, FFOrdered<T> const &v) {
   return (s << fmt::to_string(v));
 }
 
-template <typename T>
-auto inner_to_outer(FFOrdered<T> const &ff_ordered)
-    -> decltype(reversed_container(ff_ordered)) {
-  return reversed_container(ff_ordered);
-}
-
-template <typename T>
-std::vector<ff_dim_t> inner_to_outer_idxs(FFOrdered<T> const &ff_ordered) {
-  std::vector<ff_dim_t> idxs;
-  for (size_t i = 0; i < ff_ordered.size(); i++) {
-    idxs.push_back(ff_dim_t(ff_ordered.size() - i - 1));
-  }
-  return idxs;
-}
-
-template <typename T>
-std::vector<ff_dim_t> outer_to_inner_idxs(FFOrdered<T> const &ff_ordered) {
-  return reversed(inner_to_outer_idxs(ff_ordered));
-}
-
-template <typename T>
-FFOrdered<T> const &outer_to_inner(FFOrdered<T> const &ff_ordered) {
-  return ff_ordered;
-}
-
 } // namespace FlexFlow
 
 /* template */
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
index 38e7da4bb2..9e4271a1ff 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
+++ b/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
@@ -19,7 +19,7 @@ template <typename T>
 std::map<ff_dim_t, T> enumerate(FFOrdered<T> const &ff_ordered) {
   std::map<ff_dim_t, T> result;
   for (int raw_ff_dim : count(ff_ordered.size())) {
-    ff_dim_t ff_dim = ff_dim_t{raw_ff_dim};
+    ff_dim_t ff_dim =
ff_dim_t{nonnegative_int{raw_ff_dim}}; result.insert({ff_dim, ff_ordered.at(ff_dim)}); } return result; diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h index 79d4929797..f8f49233ec 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h @@ -3,6 +3,7 @@ #include "op-attrs/dim_ordered/dim_ordered.h" #include "op-attrs/dim_ordered/ff_ordered_of.h" +#include "op-attrs/ff_dim_t.h" namespace FlexFlow { @@ -10,7 +11,7 @@ template FFOrdered ff_ordered_from_map(std::map const &m) { std::vector raw; for (int i = 0; i < m.size(); i++) { - raw.push_back(m.at(ff_dim_t{i})); + raw.push_back(m.at(ff_dim_t{nonnegative_int{i}})); } return ff_ordered_of(raw); } @@ -19,7 +20,7 @@ template FFOrdered ff_ordered_from_map(std::unordered_map const &m) { std::vector raw; for (int i = 0; i < m.size(); i++) { - raw.push_back(m.at(ff_dim_t{i})); + raw.push_back(m.at(ff_dim_t{nonnegative_int{i}})); } return ff_ordered_of(raw); } diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h index 7343dc0e69..4e7f8530a4 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H #include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_dim_t.h" #include "utils/containers/count.h" #include "utils/containers/transform.h" @@ -9,7 +10,8 @@ namespace FlexFlow { template std::vector get_idxs(FFOrdered const &d) { - return transform(count(d.size()), [](int i) { return ff_dim_t{i}; }); + return transform(count(d.size()), + [](int i) { return ff_dim_t{nonnegative_int{i}}; }); } } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index e4c0e8e275..c9e6db4d17 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -22,40 +22,50 @@ DimOrdered nonoverloaded_slice(DimOrdered const &d, } template -FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - return nonoverloaded_slice(d, start, end); +FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, + std::optional const &start, + std::optional const &end) { + auto to_raw_idx = + [](std::optional const &idx) -> std::optional { + return transform(idx, + [](ff_dim_t const &i) { return i.value.get_value(); }); + }; + + return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; } -template -DimOrdered slice(DimOrdered const &d, - std::optional const &start, - std::optional const &end) { - return nonoverloaded_slice(d, start, end); +template +FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &d, + std::optional const &start, + std::optional const &end) { + auto to_raw_idx = + [](std::optional const &idx) -> std::optional { + return transform(idx, [](relative_ff_dim_t const &i) { return i.value; }); + }; + + return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; } template DimOrdered slice(DimOrdered const &d, - std::nullopt_t const &start, - Idx const &end) { - return nonoverloaded_slice( - d, std::optional{start}, std::optional{end}); + std::optional const &start = std::nullopt, + std::optional const &end = std::nullopt) { + return 
ff_dim_t_nonoverloaded_slice(d, start, end); } -template -DimOrdered slice(DimOrdered const &d, - Idx const &start, - std::nullopt_t const &end) { - return nonoverloaded_slice( - d, std::optional{start}, std::optional{end}); +template +FFOrdered slice(FFOrdered const &d, + std::optional const &start = std::nullopt, + std::optional const &end = std::nullopt) { + return ff_dim_t_nonoverloaded_slice(d, start, end); } -template -DimOrdered - slice(DimOrdered const &d, Idx const &start, Idx const &end) { - return nonoverloaded_slice( - d, std::optional{start}, std::optional{end}); +template +FFOrdered slice(FFOrdered const &d, + std::optional const &start = std::nullopt, + std::optional const &end = std::nullopt) { + return relative_ff_dim_t_nonoverloaded_slice(d, start, end); } } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ff_dim.h b/lib/op-attrs/include/op-attrs/ff_dim.h deleted file mode 100644 index e78ce4b51e..0000000000 --- a/lib/op-attrs/include/op-attrs/ff_dim.h +++ /dev/null @@ -1,18 +0,0 @@ - -#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_DIM_H -#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_DIM_H - -#include "op-attrs/ff_dim.dtg.h" -#include "rapidcheck.h" - -namespace rc { -template <> -struct Arbitrary { - static Gen arbitrary() { - return gen::construct( - gen::inRange(0, MAX_TENSOR_DIM)); - } -}; -} // namespace rc - -#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_DIM_H diff --git a/lib/op-attrs/include/op-attrs/ff_dim_t.h b/lib/op-attrs/include/op-attrs/ff_dim_t.h new file mode 100644 index 0000000000..5fab792b13 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_dim_t.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_DIM_T_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_DIM_T_H + +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/relative_ff_dim_t.dtg.h" +#include "rapidcheck.h" + +namespace FlexFlow { +relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim); +} // namespace FlexFlow + +namespace rc { +template <> +struct Arbitrary<::FlexFlow::ff_dim_t> { + static Gen<::FlexFlow::ff_dim_t> arbitrary(); +}; +} // namespace rc + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_DIM_T_H diff --git a/lib/op-attrs/include/op-attrs/ff_dim_t.struct.toml b/lib/op-attrs/include/op-attrs/ff_dim_t.struct.toml new file mode 100644 index 0000000000..38f51da4a1 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_dim_t.struct.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "ff_dim_t" + +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "value" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml index 585295fe1c..e7eeedec06 100644 --- a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/concat_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/concat_attrs.struct.toml index fab8132993..f3c66d0416 100644 --- a/lib/op-attrs/include/op-attrs/ops/concat_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/concat_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - 
"op-attrs/ff_dim.dtg.h" + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h" ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml index 66d6f99253..b8d15284e9 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "utils/stack_vector.h", + "utils/stack_vector/stack_vector.h", "op-attrs/aggregate_op.dtg.h", "op-attrs/datatype.dtg.h", ] diff --git a/lib/op-attrs/include/op-attrs/ops/flat_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/flat_attrs.struct.toml index 7349e2a8c4..301df8bca4 100644 --- a/lib/op-attrs/include/op-attrs/ops/flat_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/flat_attrs.struct.toml @@ -11,14 +11,14 @@ features = [ includes = [ "", - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.dtg.h", ] src_includes = [ "utils/fmt/optional.h", "utils/json/optional.h", "utils/rapidcheck/optional.h", - "op-attrs/ff_dim.h", + "op-attrs/ff_dim_t.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml index c8bb88dcc7..66d475aa46 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h" + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h" ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml index ec60d39f7f..d2a539e140 100644 --- a/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", - "utils/stack_vector.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", + "utils/stack_vector/stack_vector.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/reduce_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reduce_attrs.struct.toml index 717e7954e8..607bee3000 100644 --- a/lib/op-attrs/include/op-attrs/ops/reduce_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/reduce_attrs.struct.toml @@ -11,9 +11,9 @@ features = [ includes = [ "op-attrs/operator_type.dtg.h", - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", - "utils/stack_vector.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", + "utils/stack_vector/stack_vector.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml index 25a33c0c15..69c4b7580f 100644 --- a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/reverse_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reverse_attrs.struct.toml index 198346e5dd..2577ac1398 100644 --- a/lib/op-attrs/include/op-attrs/ops/reverse_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/reverse_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - 
"op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/softmax_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/softmax_attrs.struct.toml index 8b839c122a..49172f44b0 100644 --- a/lib/op-attrs/include/op-attrs/ops/softmax_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/softmax_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml index 8cdf7728af..fce827f5c2 100644 --- a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/stack_vector.h", - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", + "utils/stack_vector/stack_vector.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml index 0dc30d9a79..b1c5f60382 100644 --- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml @@ -10,8 +10,8 @@ features = [ ] includes = [ - "op-attrs/ff_dim.h", - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.h", + "op-attrs/ff_dim_t.dtg.h", "op-attrs/dim_ordered/dim_ordered.h", ] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_idx_t.variant.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_idx_t.variant.toml index 9396cbcbe8..7e7356a5e7 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_idx_t.variant.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_idx_t.variant.toml @@ -9,7 +9,7 @@ features = [ ] includes = [ - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.dtg.h", "op-attrs/replica_type.dtg.h", ] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 1b8361abf6..6b88a7bda1 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -31,8 +31,9 @@ int total_replica_degree(ParallelTensorDims const &); int total_shard_degree(ParallelTensorDims const &); int total_parallel_degree(ParallelTensorDims const &); -ShardParallelDim shard_dim_at_idx(ParallelTensorDims const &, ff_dim_t); -ShardParallelDim &shard_dim_at_idx(ParallelTensorDims &, ff_dim_t); +ShardParallelDim shard_dim_at_idx(ParallelTensorDims const &, + relative_ff_dim_t); +ShardParallelDim &shard_dim_at_idx(ParallelTensorDims &, relative_ff_dim_t); bool is_valid(ParallelTensorDims const &); TensorDims get_piece_dims(ParallelTensorDims const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index 0759dc746e..0339b9b8a6 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -1,6 +1,7 @@ #ifndef _OP_META_PARALLEL_TENSOR_SHAPE_H #define _OP_META_PARALLEL_TENSOR_SHAPE_H +#include "op-attrs/ff_dim_t.h" #include "op-attrs/parallel_dim.h" #include "op-attrs/parallel_tensor_dim_degrees.dtg.h" #include "op-attrs/parallel_tensor_dim_idx_t.dtg.h" @@ -12,13 +13,14 @@ namespace FlexFlow { int num_shard_dims(ParallelTensorShape const &); 
-ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, ff_dim_t); -ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, ff_dim_t); +ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, + relative_ff_dim_t); +ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional - try_get_shard_dim_at_idx(ParallelTensorShape const &, ff_dim_t); + try_get_shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorShape const &); diff --git a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h new file mode 100644 index 0000000000..af51cc69be --- /dev/null +++ b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_RELATIVE_FF_DIM_T_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_RELATIVE_FF_DIM_T_H + +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/relative_ff_dim_t.dtg.h" +#include "rapidcheck.h" + +namespace FlexFlow { +ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, + int input_dim); +} // namespace FlexFlow + +namespace rc { +template <> +struct Arbitrary<::FlexFlow::relative_ff_dim_t> { + static Gen<::FlexFlow::relative_ff_dim_t> arbitrary(); +}; +} // namespace rc + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_RELATIVE_FF_DIM_T_H diff --git a/lib/op-attrs/include/op-attrs/ff_dim.struct.toml b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.struct.toml similarity index 82% rename from lib/op-attrs/include/op-attrs/ff_dim.struct.toml rename to lib/op-attrs/include/op-attrs/relative_ff_dim_t.struct.toml index 441f9826ca..a93b649052 100644 --- a/lib/op-attrs/include/op-attrs/ff_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "ff_dim_t" +name = "relative_ff_dim_t" features = [ "eq", diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index ee44a39170..5e1503360b 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -9,8 +9,8 @@ namespace FlexFlow { FFOrdered const &ff_ordered(TensorDims const &); size_t num_dims(TensorDims const &); -size_t dim_at_idx(TensorDims const &, ff_dim_t); -size_t &dim_at_idx(TensorDims &, ff_dim_t); +size_t dim_at_idx(TensorDims const &, relative_ff_dim_t); +size_t &dim_at_idx(TensorDims &, relative_ff_dim_t); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index 14ee637f92..b8733cddbe 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -6,8 +6,8 @@ namespace FlexFlow { size_t num_dims(TensorShape const &); -size_t dim_at_idx(TensorShape const &, ff_dim_t); -size_t &dim_at_idx(TensorShape &, ff_dim_t); +size_t dim_at_idx(TensorShape const &, relative_ff_dim_t); +size_t &dim_at_idx(TensorShape &, relative_ff_dim_t); size_t get_num_elements(TensorShape const &); size_t get_size_in_bytes(TensorShape const &); diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc new file mode 100644 index 0000000000..75ab1a32aa --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -0,0 +1,26 @@ +#include 
"op-attrs/dim_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered + ff_dim_t_nonoverloaded_slice(FFOrdered const &d, + std::optional const &start, + std::optional const &end); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &d, + std::optional const &start, + std::optional const &end); + +template FFOrdered slice(FFOrdered const &d, + std::optional const &start, + std::optional const &end); + +template FFOrdered slice(FFOrdered const &d, + std::optional const &start, + std::optional const &end); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/src/op-attrs/ff_dim_t.cc new file mode 100644 index 0000000000..0a99e39a91 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_dim_t.cc @@ -0,0 +1,15 @@ +#include "op-attrs/ff_dim_t.h" + +namespace FlexFlow { +relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim) { + return relative_ff_dim_t{ff_dim.value.get_value()}; +} +} // namespace FlexFlow + +namespace rc { +Gen<::FlexFlow::ff_dim_t> Arbitrary<::FlexFlow::ff_dim_t>::arbitrary() { + return gen::construct<::FlexFlow::ff_dim_t>( + gen::map(gen::inRange(0, MAX_TENSOR_DIM), + [](int value) { return FlexFlow::nonnegative_int{value}; })); +} +} // namespace rc diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 483d832fee..57c7105534 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -33,15 +33,15 @@ int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { } int get_qSize(TensorShape const &query_shape) { - return dim_at_idx(query_shape, ff_dim_t(0)); + return dim_at_idx(query_shape, relative_ff_dim_t{0}); } int get_kSize(TensorShape const &key_shape) { - return dim_at_idx(key_shape, ff_dim_t(0)); + return dim_at_idx(key_shape, relative_ff_dim_t{0}); } int get_vSize(TensorShape const &value_shape) { - return dim_at_idx(value_shape, ff_dim_t(0)); + return dim_at_idx(value_shape, relative_ff_dim_t{0}); } int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index 65feb642e1..97544d1750 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -31,9 +31,9 @@ tl::expected 3)); } - size_t seq_len_q = dim_at_idx(input_q, ff_dim_t{-2}); - size_t seq_len_k = dim_at_idx(input_k, ff_dim_t{-2}); - size_t seq_len_v = dim_at_idx(input_v, ff_dim_t{-2}); + size_t seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); + size_t seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); + size_t seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +43,9 @@ tl::expected seq_len_v)); } - size_t batch_size_q = dim_at_idx(input_q, ff_dim_t{-3}); - size_t batch_size_k = dim_at_idx(input_k, ff_dim_t{-3}); - size_t batch_size_v = dim_at_idx(input_v, ff_dim_t{-3}); + size_t batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); + size_t batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); + size_t batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +63,9 @@ tl::expected 
input_v.data_type)); } - size_t q_size = dim_at_idx(input_q, ff_dim_t{-1}); - size_t k_size = dim_at_idx(input_k, ff_dim_t{-1}); - size_t v_size = dim_at_idx(input_v, ff_dim_t{-1}); + size_t q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); + size_t k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); + size_t v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc index b5ddeaac30..3bd0825555 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc @@ -42,7 +42,7 @@ tl::expected 3)); } - ShardParallelDim seq_len_q = shard_dim_at_idx(input_q, ff_dim_t{-2}); + ShardParallelDim seq_len_q = shard_dim_at_idx(input_q, relative_ff_dim_t{-2}); if (seq_len_q.degree != 1) { return tl::unexpected( fmt::format("Query sequence length parallel degree expected to be 1, " @@ -50,7 +50,7 @@ tl::expected seq_len_q.degree)); } - ShardParallelDim seq_len_k = shard_dim_at_idx(input_k, ff_dim_t{-2}); + ShardParallelDim seq_len_k = shard_dim_at_idx(input_k, relative_ff_dim_t{-2}); if (seq_len_k.degree != 1) { return tl::unexpected( fmt::format("Key sequence length parallel degree expected to be 1, but " @@ -58,7 +58,7 @@ tl::expected seq_len_k.degree)); } - ShardParallelDim seq_len_v = shard_dim_at_idx(input_v, ff_dim_t{-2}); + ShardParallelDim seq_len_v = shard_dim_at_idx(input_v, relative_ff_dim_t{-2}); if (seq_len_v.degree != 1) { return tl::unexpected( fmt::format("Value sequence length parallel degree expected to be 1, " @@ -66,9 +66,12 @@ tl::expected seq_len_v.degree)); } - ShardParallelDim batch_size_q = shard_dim_at_idx(input_q, ff_dim_t{-3}); - ShardParallelDim batch_size_k = shard_dim_at_idx(input_k, ff_dim_t{-3}); - ShardParallelDim batch_size_v = shard_dim_at_idx(input_v, ff_dim_t{-3}); + ShardParallelDim batch_size_q = + shard_dim_at_idx(input_q, relative_ff_dim_t{-3}); + ShardParallelDim batch_size_k = + shard_dim_at_idx(input_k, relative_ff_dim_t{-3}); + ShardParallelDim batch_size_v = + shard_dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same( batch_size_q.degree, batch_size_k.degree, batch_size_v.degree)) { @@ -80,7 +83,7 @@ tl::expected batch_size_v.degree)); } - ShardParallelDim query_dim = shard_dim_at_idx(input_q, ff_dim_t{-1}); + ShardParallelDim query_dim = shard_dim_at_idx(input_q, relative_ff_dim_t{-1}); if (query_dim.degree > 1) { return tl::unexpected( fmt::format("Expected query tensor to have query dim parallel degree " @@ -88,7 +91,7 @@ tl::expected query_dim.degree)); } - ShardParallelDim key_dim = shard_dim_at_idx(input_k, ff_dim_t{-1}); + ShardParallelDim key_dim = shard_dim_at_idx(input_k, relative_ff_dim_t{-1}); if (key_dim.degree > 1) { return tl::unexpected( fmt::format("Expected key tensor to have key dim parallel degree 1, " @@ -96,7 +99,7 @@ tl::expected key_dim.degree)); } - ShardParallelDim value_dim = shard_dim_at_idx(input_v, ff_dim_t{-1}); + ShardParallelDim value_dim = shard_dim_at_idx(input_v, relative_ff_dim_t{-1}); if (value_dim.degree > 1) { return tl::unexpected( fmt::format("Expected value tensor to have value dim parallel degree " diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index f9836bd3ed..71118db7a6 100644 --- 
a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -57,13 +57,13 @@ tl::expected input_rhs.data_type)); } - size_t lhs_b = dim_at_idx(input_lhs, ff_dim_t{0}); - size_t n = dim_at_idx(input_lhs, ff_dim_t{1}); - size_t lhs_m = dim_at_idx(input_lhs, ff_dim_t{2}); + size_t lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); + size_t n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); + size_t lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); - size_t rhs_b = dim_at_idx(input_rhs, ff_dim_t{0}); - size_t rhs_m = dim_at_idx(input_rhs, ff_dim_t{1}); - size_t p = dim_at_idx(input_rhs, ff_dim_t{2}); + size_t rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); + size_t rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); + size_t p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( @@ -111,13 +111,13 @@ tl::expected assert(get_total_parallel_degree(input_lhs) == get_total_parallel_degree(input_rhs)); - ShardParallelDim lhs_b = shard_dim_at_idx(input_lhs, ff_dim_t{0}); - ShardParallelDim n = shard_dim_at_idx(input_lhs, ff_dim_t{1}); - ShardParallelDim lhs_m = shard_dim_at_idx(input_lhs, ff_dim_t{2}); + ShardParallelDim lhs_b = shard_dim_at_idx(input_lhs, relative_ff_dim_t{0}); + ShardParallelDim n = shard_dim_at_idx(input_lhs, relative_ff_dim_t{1}); + ShardParallelDim lhs_m = shard_dim_at_idx(input_lhs, relative_ff_dim_t{2}); - ShardParallelDim rhs_b = shard_dim_at_idx(input_rhs, ff_dim_t{0}); - ShardParallelDim rhs_m = shard_dim_at_idx(input_rhs, ff_dim_t{1}); - ShardParallelDim p = shard_dim_at_idx(input_rhs, ff_dim_t{2}); + ShardParallelDim rhs_b = shard_dim_at_idx(input_rhs, relative_ff_dim_t{0}); + ShardParallelDim rhs_m = shard_dim_at_idx(input_rhs, relative_ff_dim_t{1}); + ShardParallelDim p = shard_dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index f394bb8473..472e5f1a25 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -67,7 +67,7 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - size_t num_channels = dim_at_idx(input_shape, ff_dim_t{1}); + size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); return TensorShape{ TensorDims{FFOrdered{ @@ -109,8 +109,12 @@ static std::optional } FFOrdered non_channel_degrees = - concat(slice(input_degrees.shard_degrees, ff_dim_t{0}, ff_dim_t{1}), - slice(input_degrees.shard_degrees, ff_dim_t{2}, std::nullopt)); + concat(slice(input_degrees.shard_degrees, + ff_dim_t{nonnegative_int{0}}, + ff_dim_t{nonnegative_int{1}}), + slice(input_degrees.shard_degrees, + ff_dim_t{nonnegative_int{2}}, + std::nullopt)); if (any_of(non_channel_degrees, [](int degree) { return degree != 1; })) { return fmt::format("Expected parallel degree of all non-channel dimensions " @@ -152,7 +156,7 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - ff_dim_t channel_dim = ff_dim_t{1}; + relative_ff_dim_t channel_dim = relative_ff_dim_t{1}; return ParallelTensorDimDegrees{ SumDegree{1}, diff --git a/lib/op-attrs/src/op-attrs/ops/broadcast.cc b/lib/op-attrs/src/op-attrs/ops/broadcast.cc index aa3c95f551..31e241e27b 100644 --- a/lib/op-attrs/src/op-attrs/ops/broadcast.cc +++ b/lib/op-attrs/src/op-attrs/ops/broadcast.cc @@ -15,7 +15,7 @@ RecordFormatter as_dot(BroadcastAttrs const &attrs) { for 
(int i = 0; i < num_dims(attrs.target_dims); i++) { r << kv(fmt::format("target_dims[{}]", i), - dim_at_idx(attrs.target_dims, ff_dim_t{i})); + dim_at_idx(attrs.target_dims, relative_ff_dim_t{i})); } return r; diff --git a/lib/op-attrs/src/op-attrs/ops/combine.cc b/lib/op-attrs/src/op-attrs/ops/combine.cc index e41b78c5af..4c51efe9b0 100644 --- a/lib/op-attrs/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/src/op-attrs/ops/combine.cc @@ -1,4 +1,5 @@ #include "op-attrs/ops/combine.h" +#include "op-attrs/ff_dim_t.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { @@ -7,8 +8,8 @@ tl::expected get_output_shape(CombineAttrs const &attrs, ParallelTensorShape const &input) { ShardParallelDim input_dim = ({ - std::optional result = - try_get_shard_dim_at_idx(input, attrs.combine_dim); + std::optional result = try_get_shard_dim_at_idx( + input, relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim)); if (!result.has_value()) { return tl::unexpected(fmt::format( "Failed to get shard dim at index {} in parallel tensor shape {}", @@ -29,7 +30,8 @@ tl::expected } ParallelTensorShape output = input; - shard_dim_at_idx(output, attrs.combine_dim).degree /= attrs.combine_degree; + shard_dim_at_idx(output, relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim)) + .degree /= attrs.combine_degree; return output; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index 74295f279e..3019151236 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -49,8 +49,10 @@ tl::expected returned.value(); }); - std::vector axis_dim_sizes = transform( - inputs, [&](TensorShape const &s) { return dim_at_idx(s, attrs.axis); }); + std::vector axis_dim_sizes = + transform(inputs, [&](TensorShape const &s) { + return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); + }); size_t output_axis_dim_size = sum(axis_dim_sizes); @@ -104,7 +106,8 @@ tl::expected }); if (!all_of(inputs, [&](ParallelTensorShape const &s) { - return shard_dim_at_idx(s, attrs.axis).degree == 1; + return shard_dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)) + .degree == 1; })) { return tl::unexpected(fmt::format( "get_output_shape for Concat expected input tensors to have parallel " diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index a8a3b10bdf..aad067feb2 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -6,10 +6,10 @@ namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { assert(num_dims(input) == 4); - size_t num_samples = dim_at_idx(input, ff_dim_t{0}); - size_t in_channels = dim_at_idx(input, ff_dim_t{1}); - size_t in_height = dim_at_idx(input, ff_dim_t{2}); - size_t in_width = dim_at_idx(input, ff_dim_t{3}); + size_t num_samples = dim_at_idx(input, relative_ff_dim_t{0}); + size_t in_channels = dim_at_idx(input, relative_ff_dim_t{1}); + size_t in_height = dim_at_idx(input, relative_ff_dim_t{2}); + size_t in_width = dim_at_idx(input, relative_ff_dim_t{3}); return Conv2DInputShape{ num_samples, diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.cc index 98f69d14c9..8143353b2d 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.cc +++ 
b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.cc @@ -7,10 +7,10 @@ Conv2DParallelInputShape parse_parallel_input_shape(ParallelTensorShape const &input) { assert(num_shard_dims(input) == 4); - ShardParallelDim sample_dim = shard_dim_at_idx(input, ff_dim_t{0}); - ShardParallelDim channel_dim = shard_dim_at_idx(input, ff_dim_t{1}); - ShardParallelDim height_dim = shard_dim_at_idx(input, ff_dim_t{2}); - ShardParallelDim width_dim = shard_dim_at_idx(input, ff_dim_t{3}); + ShardParallelDim sample_dim = shard_dim_at_idx(input, relative_ff_dim_t{0}); + ShardParallelDim channel_dim = shard_dim_at_idx(input, relative_ff_dim_t{1}); + ShardParallelDim height_dim = shard_dim_at_idx(input, relative_ff_dim_t{2}); + ShardParallelDim width_dim = shard_dim_at_idx(input, relative_ff_dim_t{3}); Conv2DParallelInputShape parsed = Conv2DParallelInputShape{ sample_dim, diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index d10d52c6f5..fe557695da 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -34,7 +34,7 @@ tl::expected } TensorShape output = input; - dim_at_idx(output, ff_dim_t{-1}) = attrs.out_channels; + dim_at_idx(output, relative_ff_dim_t{-1}) = attrs.out_channels; output.data_type = attrs.data_type; return output; } @@ -73,12 +73,12 @@ tl::expected }); SumDegree sum_degree = - SumDegree{shard_dim_at_idx(input, ff_dim_t{-1}).degree}; + SumDegree{shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; FFOrdered shard_degrees = transform(input.dims.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); - shard_degrees.at(ff_dim_t{-1}) = get_discard_copy_degree(input); + shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, shard_degrees); diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index e9833d5e3f..bc86102566 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -11,8 +11,9 @@ namespace FlexFlow { TensorShape get_output_shape(FlatAttrs const &attrs, TensorShape const &input_shape) { - FFOrdered leading_dims = - slice(ff_ordered(input_shape.dims), ff_dim_t{0}, attrs.start_dim); + FFOrdered leading_dims = slice(ff_ordered(input_shape.dims), + ff_dim_t{nonnegative_int{0}}, + attrs.start_dim); FFOrdered flattened_dims = slice(ff_ordered(input_shape.dims), attrs.start_dim, attrs.end_dim); FFOrdered trailing_dims = @@ -57,7 +58,9 @@ tl::expected /*discard_copy_degree=*/input_degrees.discard_copy_degree, /*shard_degrees=*/ concat(std::vector{ - slice(input_degrees.shard_degrees, ff_dim_t{0}, attrs.start_dim), + slice(input_degrees.shard_degrees, + ff_dim_t{nonnegative_int{0}}, + attrs.start_dim), {product(flattened_dim_degrees)}, slice(input_degrees.shard_degrees, attrs.end_dim, std::nullopt), }), diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 0dd9ac7a17..86426dd18f 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -73,7 +73,8 @@ tl::expected [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { - return dim_at_idx(input_shape, dim_idx); + return dim_at_idx(input_shape, + 
relative_ff_dim_t_from_ff_dim_t(dim_idx)); }); return TensorShape{ @@ -117,7 +118,9 @@ static std::optional } if (!all_of(attrs.axes, [&](ff_dim_t axis) { - return shard_dim_at_idx(input_shape, axis).degree == 1; + return shard_dim_at_idx(input_shape, + relative_ff_dim_t_from_ff_dim_t(axis)) + .degree == 1; })) { return fmt::format("Expected parallel degree of all dimensions in " "LayerNorm axes {} to be 1, but received input shape {}", @@ -163,7 +166,8 @@ tl::expected [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); std::vector raw_weight_shard_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { - return shard_dim_at_idx(input_shape, dim_idx); + return shard_dim_at_idx(input_shape, + relative_ff_dim_t_from_ff_dim_t(dim_idx)); }); return ParallelTensorShape{ diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index feac647216..e00a47d490 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -41,7 +41,7 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - size_t in_channels = dim_at_idx(input_shape, ff_dim_t{-1}); + size_t in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ @@ -64,7 +64,7 @@ tl::expected tl::expected get_output_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { TensorShape output_shape = input_shape; - output_shape.dims.ff_ordered.at(ff_dim_t{-1}) = + output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = size_t_from_int(attrs.out_channels); return output_shape; @@ -84,11 +84,11 @@ tl::expected SumDegree sum_degree = SumDegree{1}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ - get_sum_degree(input) * - product( - slice(ff_ordered_shard_degrees(input), std::nullopt, ff_dim_t{-1}))}; + get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), + std::nullopt, + relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ - shard_dim_at_idx(input, ff_dim_t{-1}).degree, + shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, get_discard_copy_degree(input), }; @@ -107,10 +107,11 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{ - get_sum_degree(input) * shard_dim_at_idx(input, ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product( - slice(ff_ordered_shard_degrees(input), std::nullopt, ff_dim_t{-1}))}; + SumDegree sum_degree = + SumDegree{get_sum_degree(input) * + shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( + ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; return lift_to_parallel_with_degrees( @@ -129,11 +130,12 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{ - get_sum_degree(input) * shard_dim_at_idx(input, ff_dim_t{-1}).degree}; + SumDegree sum_degree = + SumDegree{get_sum_degree(input) * + shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; FFOrdered shard_degrees = ff_ordered_shard_degrees(input); - shard_degrees.at(ff_dim_t{-1}) = get_discard_copy_degree(input); + shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, 
shard_degrees); diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index 95bcd8b336..86d287ebc8 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -22,10 +22,10 @@ tl::expected input_dims)); } - size_t num_samples = dim_at_idx(input_dims, ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_dims, ff_dim_t{1}); - size_t input_h = dim_at_idx(input_dims, ff_dim_t{2}); - size_t input_w = dim_at_idx(input_dims, ff_dim_t{3}); + size_t num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); + size_t num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); + size_t input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); + size_t input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); if (input_h % output_h != 0) { return tl::unexpected(fmt::format( @@ -113,10 +113,10 @@ tl::expected input_shape)); } - size_t num_samples = dim_at_idx(input_shape, ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_shape, ff_dim_t{1}); - size_t input_height = dim_at_idx(input_shape, ff_dim_t{2}); - size_t input_width = dim_at_idx(input_shape, ff_dim_t{3}); + size_t num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); + size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + size_t input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); + size_t input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); size_t output_height = (input_height + 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + diff --git a/lib/op-attrs/src/op-attrs/ops/repartition.cc b/lib/op-attrs/src/op-attrs/ops/repartition.cc index 37a0b8a168..5bda589eb3 100644 --- a/lib/op-attrs/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/src/op-attrs/ops/repartition.cc @@ -6,8 +6,9 @@ tl::expected get_output_shape(RepartitionAttrs const &attrs, ParallelTensorShape const &input_shape) { ParallelTensorShape output_shape = input_shape; - output_shape.dims.shard_dims.at(attrs.repartition_dim).degree *= - attrs.repartition_degree; + output_shape.dims.shard_dims + .at(relative_ff_dim_t_from_ff_dim_t(attrs.repartition_dim)) + .degree *= attrs.repartition_degree; return output_shape; } diff --git a/lib/op-attrs/src/op-attrs/ops/softmax.cc b/lib/op-attrs/src/op-attrs/ops/softmax.cc index 541c590cbd..0d55a2ec2c 100644 --- a/lib/op-attrs/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/src/op-attrs/ops/softmax.cc @@ -39,7 +39,8 @@ tl::expected get_discard_copy_degree(input_shape))); } - if (shard_dim_at_idx(input_shape, attrs.dim).degree != 1) { + if (shard_dim_at_idx(input_shape, relative_ff_dim_t_from_ff_dim_t(attrs.dim)) + .degree != 1) { return tl::unexpected( fmt::format("Expected parallel degree of Softmax dimension {} to be 1, " "but received input shape {}", diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 2955545561..0bb940924a 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -96,11 +96,13 @@ bool is_valid(ParallelTensorDims const &dims) { [](ReplicaParallelDim const &d) { return is_valid(d); }); } -ShardParallelDim shard_dim_at_idx(ParallelTensorDims const &d, ff_dim_t idx) { +ShardParallelDim shard_dim_at_idx(ParallelTensorDims const &d, + relative_ff_dim_t idx) { return d.shard_dims.at(idx); } -ShardParallelDim &shard_dim_at_idx(ParallelTensorDims &d, ff_dim_t idx) { +ShardParallelDim &shard_dim_at_idx(ParallelTensorDims &d, + relative_ff_dim_t idx) { return 
d.shard_dims.at(idx); } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index dcc567e0ca..bbad13b46b 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -41,11 +41,13 @@ bool is_valid(ParallelTensorShape const &shape) { return is_valid(shape.dims); } -ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &s, ff_dim_t d) { +ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &s, + relative_ff_dim_t d) { return shard_dim_at_idx(s.dims, d); } -ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, ff_dim_t d) { +ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, + relative_ff_dim_t d) { return shard_dim_at_idx(s.dims, d); } @@ -54,7 +56,8 @@ FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { } std::optional - try_get_shard_dim_at_idx(ParallelTensorShape const &s, ff_dim_t d) { + try_get_shard_dim_at_idx(ParallelTensorShape const &s, + relative_ff_dim_t d) { if (s.dims.shard_dims.idx_is_valid(d)) { return s.dims.shard_dims.at(d); } else { @@ -138,10 +141,10 @@ std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t(ff_dim_t(idx)); + return parallel_tensor_dim_idx_t{ff_dim_t{nonnegative_int{idx}}}; })); - indices.insert(parallel_tensor_dim_idx_t(ReplicaType::SUM)); - indices.insert(parallel_tensor_dim_idx_t(ReplicaType::DISCARD_COPY)); + indices.insert(parallel_tensor_dim_idx_t{ReplicaType::SUM}); + indices.insert(parallel_tensor_dim_idx_t{ReplicaType::DISCARD_COPY}); return indices; } diff --git a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc new file mode 100644 index 0000000000..0671bb05f2 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc @@ -0,0 +1,21 @@ +#include "op-attrs/relative_ff_dim_t.h" +#include "rapidcheck.h" + +namespace FlexFlow { +ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, + int input_dim) { + int raw = ff_dim.value; + if (raw < 0) { + raw = input_dim + raw; + } + return ff_dim_t{nonnegative_int{raw}}; +} +} // namespace FlexFlow + +namespace rc { +Gen<::FlexFlow::relative_ff_dim_t> + Arbitrary<::FlexFlow::relative_ff_dim_t>::arbitrary() { + return gen::construct<::FlexFlow::relative_ff_dim_t>( + gen::inRange(-MAX_TENSOR_DIM, MAX_TENSOR_DIM)); +} +} // namespace rc diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 1bb050db52..f0ac88d8e4 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -19,11 +19,11 @@ size_t num_dims(TensorDims const &dims) { return dims.ff_ordered.size(); } -size_t dim_at_idx(TensorDims const &dims, ff_dim_t idx) { +size_t dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -size_t &dim_at_idx(TensorDims &dims, ff_dim_t idx) { +size_t &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 07508e3065..70ed58aac6 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -11,11 +11,11 @@ size_t num_dims(TensorShape const &s) { return s.dims.ff_ordered.size(); } -size_t dim_at_idx(TensorShape const 
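A worked example of `ff_dim_t_from_relative_ff_dim_t` as defined above, with `input_dim = 5` to match the unit tests later in this patch: non-negative values pass through, negative values are offset by `input_dim`, and anything still negative is rejected by the `nonnegative_int` constructor. Note that positive indices are not range-checked upward here.

    #include "op-attrs/relative_ff_dim_t.h"

    void conversion_sketch() {
      ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{2}, 5);   // ff_dim_t{nonnegative_int{2}}
      ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{-1}, 5);  // ff_dim_t{nonnegative_int{4}}, i.e. 5 + (-1)
      ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{10}, 5);  // ff_dim_t{nonnegative_int{10}}, no upper bound check
      // ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{-10}, 5);  // would throw: 5 + (-10) is negative
    }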
&s, ff_dim_t idx) { +size_t dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t &dim_at_idx(TensorShape &s, ff_dim_t idx) { +size_t &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc index 180bc2a01f..bf4c33d65a 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc @@ -10,9 +10,9 @@ TEST_SUITE(FF_TEST_SUITE) { std::map result = enumerate(input); std::map correct = { - {ff_dim_t{0}, "zero"}, - {ff_dim_t{1}, "one"}, - {ff_dim_t{2}, "two"}, + {ff_dim_t{nonnegative_int{0}}, "zero"}, + {ff_dim_t{nonnegative_int{1}}, "one"}, + {ff_dim_t{nonnegative_int{2}}, "two"}, }; CHECK(result == correct); diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc index 7bc1695e5c..bba989920e 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc @@ -20,9 +20,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input is missing keys") { SUBCASE("missing key is in middle") { T m = { - {ff_dim_t{0}, 4}, - {ff_dim_t{1}, 2}, - {ff_dim_t{3}, 5}, + {ff_dim_t{nonnegative_int{0}}, 4}, + {ff_dim_t{nonnegative_int{1}}, 2}, + {ff_dim_t{nonnegative_int{3}}, 5}, }; CHECK_THROWS(ff_ordered_from_map(m)); @@ -30,31 +30,21 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("missing key is 0 idx") { T m = { - {ff_dim_t{1}, 2}, - {ff_dim_t{2}, 7}, - {ff_dim_t{3}, 5}, + {ff_dim_t{nonnegative_int{1}}, 2}, + {ff_dim_t{nonnegative_int{2}}, 7}, + {ff_dim_t{nonnegative_int{3}}, 5}, }; CHECK_THROWS(ff_ordered_from_map(m)); } } - SUBCASE("input has negative keys") { - T m = { - {ff_dim_t{0}, 4}, - {ff_dim_t{1}, 5}, - {ff_dim_t{-1}, 2}, - }; - - CHECK_THROWS(ff_ordered_from_map(m)); - } - SUBCASE("input is valid") { T m = { - {ff_dim_t{0}, 4}, - {ff_dim_t{1}, 5}, - {ff_dim_t{2}, 2}, - {ff_dim_t{3}, 7}, + {ff_dim_t{nonnegative_int{0}}, 4}, + {ff_dim_t{nonnegative_int{1}}, 5}, + {ff_dim_t{nonnegative_int{2}}, 2}, + {ff_dim_t{nonnegative_int{3}}, 7}, }; FFOrdered result = ff_ordered_from_map(m); diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc index 8d5f247756..b2fddd058e 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc @@ -4,22 +4,92 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE( - "slice(DimOrdered, std::optional, std::optional)") { + TEST_CASE("slice(FFOrdered, ..., ...") { FFOrdered d = FFOrdered{ 1, 2, 3, 4, }; + SUBCASE("ff_dim_t, ff_dim_t") { + FFOrdered result = + slice(d, ff_dim_t{nonnegative_int{1}}, ff_dim_t{nonnegative_int{3}}); + FFOrdered correct = FFOrdered{2, 3}; - FFOrdered result = slice(d, std::nullopt, ff_dim_t{-1}); - FFOrdered correct = FFOrdered{ - 1, - 2, - 3, - }; + CHECK(result == correct); + } + SUBCASE("ff_dim_t, std::nullopt_t") { + FFOrdered result = + slice(d, ff_dim_t{nonnegative_int{1}}, std::nullopt); + FFOrdered correct = FFOrdered{2, 3, 4}; + + CHECK(result == correct); + } + SUBCASE("std::nullopt_t, ff_dim_t") { + FFOrdered result = + slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}}); + FFOrdered correct = FFOrdered{1, 2, 3}; + + CHECK(result == correct); + } + SUBCASE("relative_ff_dim_t, 
relative_ff_dim_t") { + FFOrdered result = + slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1}); + FFOrdered correct = FFOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("relative_ff_dim_t, std::nullopt_t") { + FFOrdered result = slice(d, relative_ff_dim_t{-3}, std::nullopt); + FFOrdered correct = FFOrdered{2, 3, 4}; + + CHECK(result == correct); + } + SUBCASE("std::nullopt_t, relative_ff_dim_t") { + FFOrdered result = slice(d, std::nullopt, relative_ff_dim_t{-1}); + FFOrdered correct = FFOrdered{1, 2, 3}; + + CHECK(result == correct); + } + SUBCASE("start index = stop index") { + FFOrdered result = + slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1}); + FFOrdered correct = FFOrdered{}; + + CHECK(result == correct); + } + SUBCASE("start index = stop index (using negative indexing)") { + FFOrdered result = + slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-3}); + FFOrdered correct = FFOrdered{}; + + CHECK(result == correct); + } + SUBCASE("start index > stop index") { + FFOrdered result = + slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{0}); + FFOrdered correct = FFOrdered{}; + + CHECK(result == correct); + } + SUBCASE("start index > stop index (using negative indexing)") { + FFOrdered result = + slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-4}); + FFOrdered correct = FFOrdered{}; - CHECK(result == correct); + CHECK(result == correct); + } + SUBCASE("start index out of bounds (too low)") { + CHECK_THROWS(slice(d, relative_ff_dim_t{-10}, std::nullopt)); + } + SUBCASE("start index out of bounds (too high)") { + CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt)); + } + SUBCASE("stop index out of bounds (too low)") { + CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10})); + } + SUBCASE("stop index out of bounds (too high)") { + CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10})); + } } } diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/zip.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/zip.cc index 8e3d0f1b80..b77bb8f71e 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/zip.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/zip.cc @@ -1,5 +1,5 @@ #include "op-attrs/dim_ordered/zip.h" -#include "op-attrs/ff_dim.dtg.h" +#include "op-attrs/ff_dim_t.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include diff --git a/lib/op-attrs/test/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/test/src/op-attrs/ff_dim_t.cc new file mode 100644 index 0000000000..9dc36f28ab --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_dim_t.cc @@ -0,0 +1,22 @@ +#include "op-attrs/ff_dim_t.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("relative_ff_dim_t_from_ff_dim_t") { + SUBCASE("absolute index is zero") { + ff_dim_t ff_dim = ff_dim_t{nonnegative_int{0}}; + relative_ff_dim_t relative_ff_dim = + relative_ff_dim_t_from_ff_dim_t(ff_dim); + CHECK(relative_ff_dim == relative_ff_dim_t{0}); + } + + SUBCASE("absolute index is positive") { + ff_dim_t ff_dim = ff_dim_t{nonnegative_int{1}}; + relative_ff_dim_t relative_ff_dim = + relative_ff_dim_t_from_ff_dim_t(ff_dim); + CHECK(relative_ff_dim == relative_ff_dim_t{1}); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/get_incoming_tensor_roles.cc b/lib/op-attrs/test/src/op-attrs/get_incoming_tensor_roles.cc index 33cc00c6a1..4688ad4008 100644 --- a/lib/op-attrs/test/src/op-attrs/get_incoming_tensor_roles.cc +++ b/lib/op-attrs/test/src/op-attrs/get_incoming_tensor_roles.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Concat") { int num_incoming = 4; ComputationGraphOpAttrs 
attrs = - ComputationGraphOpAttrs{ConcatAttrs{ff_dim_t{0}}}; + ComputationGraphOpAttrs{ConcatAttrs{ff_dim_t{nonnegative_int{0}}}}; std::vector result = get_incoming_tensor_roles(attrs, num_incoming); diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index bf74a072e0..577961b7b1 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -24,7 +24,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("valid") { - ff_dim_t dim = ff_dim_t{2}; + ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; int degree = 3; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - ff_dim_t dim = ff_dim_t{2}; + ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; int degree = 4; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index 9e842c3ebe..2d9842b1dd 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -10,7 +10,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(ConcatAttrs, std::vector)") { ConcatAttrs attrs = ConcatAttrs{ - ff_dim_t{1}, + ff_dim_t{nonnegative_int{1}}, }; SUBCASE("empty input shapes list passed") { @@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("concat axis is out of bounds") { attrs = ConcatAttrs{ - ff_dim_t{3}, + ff_dim_t{nonnegative_int{3}}, }; std::vector input_shapes = { @@ -115,7 +115,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(ConcatAttrs, std::vector)") { ConcatAttrs attrs = ConcatAttrs{ - ff_dim_t{1}, + ff_dim_t{nonnegative_int{1}}, }; size_t dim0_size = 12; diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc index b091833f10..d5aab55cb2 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched dim size") { TensorShape incorrect_rhs = input_lhs; - dim_at_idx(incorrect_rhs, ff_dim_t{0}) += 1; + dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1; tl::expected result = get_output_shape(attrs, input_lhs, incorrect_rhs); diff --git a/lib/op-attrs/test/src/op-attrs/ops/flat.cc b/lib/op-attrs/test/src/op-attrs/ops/flat.cc index d81ab95c35..8998dfaffd 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/flat.cc @@ -20,8 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten all dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{0}, - /*end_dim=*/ff_dim_t{4}, + /*start_dim=*/ff_dim_t{nonnegative_int{0}}, + /*end_dim=*/ff_dim_t{nonnegative_int{4}}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -37,8 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten trailing dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{2}, - /*end_dim=*/ff_dim_t{4}, + /*start_dim=*/ff_dim_t{nonnegative_int{2}}, + /*end_dim=*/ff_dim_t{nonnegative_int{4}}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -56,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten leading dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{0}, - /*end_dim=*/ff_dim_t{2}, + /*start_dim=*/ff_dim_t{nonnegative_int{0}}, + /*end_dim=*/ff_dim_t{nonnegative_int{2}}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -75,8 
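For orientation in the surrounding FlatAttrs test cases: `FlatAttrs{start_dim, end_dim}` collapses the dimensions in the half-open range [start_dim, end_dim) into one dimension holding their product, as `get_output_shape` in flat.cc above computes. A sketch with hypothetical sizes:

    #include "op-attrs/ops/flat.h"

    // Hypothetical input dims [2, 4, 2, 3] for illustration.
    FlatAttrs middle_dims_attrs = FlatAttrs{
        /*start_dim=*/ff_dim_t{nonnegative_int{1}},
        /*end_dim=*/ff_dim_t{nonnegative_int{3}},
    };
    // get_output_shape(middle_dims_attrs, input) then has dims [2, 4 * 2, 3] == [2, 8, 3].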
+75,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten middle dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{1}, - /*end_dim=*/ff_dim_t{3}, + /*start_dim=*/ff_dim_t{nonnegative_int{1}}, + /*end_dim=*/ff_dim_t{nonnegative_int{3}}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten no dims (start_dim == end_dim)") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{2}, - /*end_dim=*/ff_dim_t{2}, + /*start_dim=*/ff_dim_t{nonnegative_int{2}}, + /*end_dim=*/ff_dim_t{nonnegative_int{2}}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -106,8 +106,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten no dims (start_dim < end_dim)") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{2}, - /*end_dim=*/ff_dim_t{1}, + /*start_dim=*/ff_dim_t{nonnegative_int{2}}, + /*end_dim=*/ff_dim_t{nonnegative_int{1}}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -119,8 +119,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE( "get_output_parallel_dim_degrees(FlatAttrs, ParallelTensorDimDegrees)") { - FlatAttrs attrs = FlatAttrs{/*start_dim=*/ff_dim_t{1}, - /*end_dim=*/ff_dim_t{3}}; + FlatAttrs attrs = FlatAttrs{/*start_dim=*/ff_dim_t{nonnegative_int{1}}, + /*end_dim=*/ff_dim_t{nonnegative_int{3}}}; SUBCASE("allows shard parallelism in non-flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ @@ -217,8 +217,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{1}, - /*end_dim=*/ff_dim_t{3}, + /*start_dim=*/ff_dim_t{nonnegative_int{1}}, + /*end_dim=*/ff_dim_t{nonnegative_int{3}}, }; tl::expected result = diff --git a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc index f45ea91dac..b9426a89a2 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_layer_norm_incoming_tensor_roles(LayerNormAttrs)") { auto make_attrs = [](bool elementwise_affine) { return LayerNormAttrs{ - /*axes=*/{ff_dim_t{0}, ff_dim_t{2}}, + /*axes=*/{ff_dim_t{nonnegative_int{0}}, ff_dim_t{nonnegative_int{2}}}, elementwise_affine, /*eps=*/1.0, }; @@ -46,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("shape inference (LayerNorm)") { LayerNormAttrs attrs_affine_true = LayerNormAttrs{ - /*axes=*/{ff_dim_t{1}, ff_dim_t{3}}, + /*axes=*/{ff_dim_t{nonnegative_int{1}}, ff_dim_t{nonnegative_int{3}}}, /*elementwise_affine=*/true, /*eps=*/0.1, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc index 8bc8205183..ba213f54f4 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Repartition shape inference") { - ff_dim_t dim = ff_dim_t{2}; + ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; int degree = 4; RepartitionAttrs attrs = RepartitionAttrs{ /*repartition_dim=*/dim, diff --git a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc index 65a74932cb..5808e5ef42 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1}}; + SoftmaxAttrs attrs = 
SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; tl::expected result = get_output_shape(attrs, input); @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; std::optional result = optional_from_expected(get_output_shape(attrs, input)); @@ -70,7 +70,7 @@ TEST_SUITE(FF_TEST_SUITE) { make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; tl::expected result = get_output_shape(attrs, par_input); @@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -94,7 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("partition parallism in softmax dim (invalid)") { int degree1 = 2; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; ParallelTensorShape par_input = make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, 1); @@ -109,7 +109,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("sum parallelism (invalid)") { SumDegree sum_degree = SumDegree{2}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; ParallelTensorShape par_input = make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); @@ -124,7 +124,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("discard copy parallelism (invalid)") { DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; ParallelTensorShape par_input = make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); diff --git a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc index ebeaec4d19..73f5f0674d 100644 --- a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("PCGOperatorAttrs to/from json") { PCGOperatorAttrs correct = PCGOperatorAttrs{RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{1}, + /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, /*repartition_degree=*/4, }}; nlohmann::json j = correct; diff --git a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc new file mode 100644 index 0000000000..c09c1ec3df --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc @@ -0,0 +1,50 @@ +#include "op-attrs/relative_ff_dim_t.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ff_dim_t_from_relative_ff_dim_t") { + int input_dim = 5; + + SUBCASE("relative index is zero") { + relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{0}; + ff_dim_t ff_dim = + ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); + CHECK(ff_dim == ff_dim_t{nonnegative_int{0}}); + } + + SUBCASE("relative index is positive") { + + SUBCASE("relative index is in range") { + relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{1}; + ff_dim_t ff_dim = + ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); + CHECK(ff_dim == 
ff_dim_t{nonnegative_int{1}}); + } + + SUBCASE("relative index is out of range") { + relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{10}; + ff_dim_t ff_dim = + ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); + CHECK(ff_dim == ff_dim_t{nonnegative_int{10}}); + } + } + + SUBCASE("relative index is negative") { + + SUBCASE("relative index is in range") { + relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{-1}; + ff_dim_t ff_dim = + ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); + CHECK(ff_dim == ff_dim_t{nonnegative_int{4}}); + } + + SUBCASE("relative index is out of range") { + relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{-10}; + CHECK_THROWS( + ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim)); + } + } + } +} diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 45cde0de57..df93f69f2e 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -116,7 +116,7 @@ struct ComputationGraphBuilder { // Add a gather layer tensor_guid_t gather(tensor_guid_t const &input, tensor_guid_t const &index, - ff_dim_t dim, + relative_ff_dim_t dim, std::optional const &name = std::nullopt); // Add a cache layer tensor_guid_t diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 691926e01e..61f7b76ff0 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -20,6 +20,7 @@ #include "op-attrs/ops/pool_2d.h" #include "op-attrs/ops/softmax.h" #include "op-attrs/ops/weight_attrs.dtg.h" +#include "op-attrs/relative_ff_dim_t.h" #include "op-attrs/tensor_dims.h" #include "pcg/computation_graph.h" #include "utils/containers/any_of.h" @@ -28,6 +29,7 @@ #include "utils/containers/get_only.h" #include "utils/containers/transform.h" #include "utils/expected.h" +#include "utils/stack_vector/stack_vector_of.h" #include namespace FlexFlow { @@ -491,13 +493,8 @@ tensor_guid_t ComputationGraphBuilder::embedding( tensor_guid_t ComputationGraphBuilder::gather( tensor_guid_t const &input, tensor_guid_t const &index, - ff_dim_t dim, + relative_ff_dim_t dim, std::optional const &maybe_name) { - GatherAttrs attrs = GatherAttrs{dim}; - std::string name = - maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); - - LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; if (this->get_shape(index).data_type != DataType::INT32 && this->get_shape(index).data_type != DataType::INT64) { throw mk_runtime_error( @@ -507,6 +504,13 @@ tensor_guid_t ComputationGraphBuilder::gather( DataType::INT32, DataType::INT64)); } + + GatherAttrs attrs = GatherAttrs{ + ff_dim_t_from_relative_ff_dim_t(dim, num_dims(this->get_shape(input)))}; + std::string name = + maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); + + LayerAttrs layer = LayerAttrs{ComputationGraphOpAttrs{attrs}, name}; TensorShape output_shape = get_output_shape(attrs, this->get_shape(input), this->get_shape(index)); @@ -803,7 +807,9 @@ tensor_guid_t ComputationGraphBuilder::concat( int axis, std::optional const &maybe_name) { - ConcatAttrs attrs = ConcatAttrs{ff_dim_t{axis}}; + relative_ff_dim_t wrapped_axis = relative_ff_dim_t{axis}; + ConcatAttrs attrs = ConcatAttrs{ff_dim_t_from_relative_ff_dim_t( + wrapped_axis, num_dims(this->get_shape(inputs[0])))}; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -827,8 
+833,11 @@ tensor_guid_t ComputationGraphBuilder::flat( int input_num_dims = num_dims(this->get_shape(input)); FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{start_dim}, - /*end_dim=*/ff_dim_t{end_dim.value_or(input_num_dims)}, + /*start_dim=*/ff_dim_t_from_relative_ff_dim_t( + relative_ff_dim_t{start_dim}, input_num_dims), + /*end_dim=*/ + ff_dim_t_from_relative_ff_dim_t( + relative_ff_dim_t{end_dim.value_or(input_num_dims)}, input_num_dims), }; std::string name = @@ -844,15 +853,24 @@ tensor_guid_t ComputationGraphBuilder::flat( tensor_guid_t ComputationGraphBuilder::layer_norm( tensor_guid_t const &input, - std::vector const &axes, + std::vector const &relative_axes, bool elementwise_affine, float eps, std::optional const &maybe_name) { TensorShape input_shape = this->get_shape(input); - if (any_of(axes, - [&](size_t axis) { return axis >= num_dims(input_shape); })) { + auto resolve_dim_idx = [&](int dim_idx) { + return ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{dim_idx}, + num_dims(input_shape)); + }; + + stack_vector axes = stack_vector_of( + transform(relative_axes, resolve_dim_idx)); + + if (any_of(axes, [&](ff_dim_t axis) { + return axis.value >= num_dims(input_shape); + })) { throw mk_runtime_error(fmt::format( "ComputationGraphBuilder::layer_norm received axes {} with " "out-of-bound element (input tensor has num dimensions = {})", @@ -861,7 +879,7 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( } LayerNormAttrs attrs = LayerNormAttrs{ - stack_vector{axes.begin(), axes.end()}, + axes, elementwise_affine, eps, }; @@ -909,16 +927,17 @@ tensor_guid_t ComputationGraphBuilder::softmax( int dim = maybe_dim.value_or(num_dims(input_shape) - 1); - if (dim >= num_dims(input_shape)) { + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t( + relative_ff_dim_t{dim}, num_dims(input_shape))}; + + if (attrs.dim.value >= num_dims(input_shape)) { throw mk_runtime_error( fmt::format("ComputationGraphBuilder::softmax received out-of-bounds " "dim {} for input tensor shape {}", - dim, + attrs.dim.value, input_shape)); } - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{dim}}; - std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 20bd0ac92d..3f66b33b6e 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -492,7 +492,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_partition(input, ff_dim_t{0}, 2); + parallel_tensor_guid_t output = + b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -531,7 +532,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_combine(input, ff_dim_t{0}, 2); + parallel_tensor_guid_t output = + b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { diff --git a/lib/runtime/src/operator.h b/lib/runtime/src/operator.h index c44daa5029..2db40de78b 100644 --- a/lib/runtime/src/operator.h +++ b/lib/runtime/src/operator.h @@ -9,7 
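The ComputationGraphBuilder changes above all follow one pattern: the user-facing API accepts relative (possibly negative) dims and resolves them to absolute `ff_dim_t` exactly once, when the op attrs are constructed, so everything downstream of the builder sees only non-negative indices. A sketch of the pattern for softmax, assuming a rank-4 input:

    int input_num_dims = 4;  // in the builder: num_dims(this->get_shape(input))
    // A user-facing dim of -1 means "last dimension":
    SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(
        relative_ff_dim_t{-1}, input_num_dims)};
    // attrs.dim == ff_dim_t{nonnegative_int{3}}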
+9,7 @@ #include "runtime/config.h" #include "tasks.h" #include "utils/stack_string.h" -#include "utils/stack_vector.h" +#include "utils/stack_vector/stack_vector.h" #include "utils/strong_typedef.h" #include #include diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 2370739d58..253fd3cb4f 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -85,7 +85,7 @@ static std::optional attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape.at(ff_dim_t(0))); + input.shape.at(ff_dim_t{nonnegative_int{0}})); } TaskImplFunction get_embedding_fwd_task_impl() { diff --git a/lib/runtime/src/parallel_op_info.h b/lib/runtime/src/parallel_op_info.h index ebd44f012b..49ad22be74 100644 --- a/lib/runtime/src/parallel_op_info.h +++ b/lib/runtime/src/parallel_op_info.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_PARALLEL_OPS_PARALLEL_OP_INFO_H #define _FLEXFLOW_PARALLEL_OPS_PARALLEL_OP_INFO_H -#include "op-attrs/ff_dim.h" +#include "op-attrs/ff_dim_t.h" #include "op-attrs/operator_type.h" #include "utils/visitable.h" #include diff --git a/lib/runtime/src/realm_allocator.h b/lib/runtime/src/realm_allocator.h index 5ac1b55de5..315cb5f201 100644 --- a/lib/runtime/src/realm_allocator.h +++ b/lib/runtime/src/realm_allocator.h @@ -3,7 +3,7 @@ #include "kernels/allocation.h" #include "legion.h" -#include "utils/stack_vector.h" +#include "utils/stack_vector/stack_vector.h" #include #define MAX_INSTANCE_ALLOCATIONS 1 diff --git a/lib/runtime/src/tensor.h b/lib/runtime/src/tensor.h index c7eabacdec..937b14a129 100644 --- a/lib/runtime/src/tensor.h +++ b/lib/runtime/src/tensor.h @@ -25,7 +25,7 @@ #include "op-attrs/param_sync.h" #include "op-attrs/tensor_shape.h" #include "utils/optional.h" -#include "utils/stack_vector.h" +#include "utils/stack_vector/stack_vector.h" #include #include #include diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml index 02a856f59a..8fe4a9494d 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml @@ -13,7 +13,7 @@ includes = [ "", "", "op-attrs/operator_type.dtg.h", - "op-attrs/ff_dim.dtg.h", + "op-attrs/ff_dim_t.dtg.h", "op-attrs/activation.dtg.h", "op-attrs/aggregate_op.dtg.h", "op-attrs/regularizer_attrs.dtg.h", diff --git a/lib/substitutions/test/src/test_substitution.cc b/lib/substitutions/test/src/test_substitution.cc index 344954c553..dcb06a78fa 100644 --- a/lib/substitutions/test/src/test_substitution.cc +++ b/lib/substitutions/test/src/test_substitution.cc @@ -52,7 +52,8 @@ using namespace FlexFlow; // OperatorAttrAssignment op_ass_n1{ // {{OperatorAttributeKey::OP_TYPE, // AttrConstant{OperatorType::REPARTITION}}, -// {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}}, +// {OperatorAttributeKey::PARALLEL_DIM, +// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, // {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; // // OperatorAttrAssignment op_ass_n2{ @@ -71,7 +72,8 @@ using namespace FlexFlow; // OperatorAttrAssignment op_ass_n3{ // {{OperatorAttributeKey::OP_TYPE, // AttrConstant{OperatorType::REDUCTION}}, -// {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}}, +// {OperatorAttributeKey::PARALLEL_DIM, +// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, // 
{OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; // // auto og = NodeLabelledOpenMultiDiGraph::create< diff --git a/lib/utils/include/utils/archetypes/ordered_value_type.h b/lib/utils/include/utils/archetypes/ordered_value_type.h new file mode 100644 index 0000000000..56cbb77d0b --- /dev/null +++ b/lib/utils/include/utils/archetypes/ordered_value_type.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_ARCHETYPES_ORDERED_VALUE_TYPE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_ARCHETYPES_ORDERED_VALUE_TYPE_H + +#include +#include + +namespace FlexFlow { + +template +struct ordered_value_type { + ordered_value_type() = delete; + + ordered_value_type(ordered_value_type const &) { + assert(false); + } + ordered_value_type &operator=(ordered_value_type const &) { + assert(false); + } + + ordered_value_type(ordered_value_type &&) { + assert(false); + } + ordered_value_type &operator=(ordered_value_type &&) { + assert(false); + } + + bool operator==(ordered_value_type const &) const { + assert(false); + } + bool operator!=(ordered_value_type const &) const { + assert(false); + } + + bool operator<(ordered_value_type const &) const { + assert(false); + } + bool operator>(ordered_value_type const &) const { + assert(false); + } +}; + +} // namespace FlexFlow + +namespace std { + +template +struct hash<::FlexFlow::ordered_value_type> { + size_t operator()(::FlexFlow::ordered_value_type const &) const { + assert(false); + }; +}; + +} // namespace std + +#endif diff --git a/lib/utils/include/utils/containers/recurse_n.h b/lib/utils/include/utils/containers/recurse_n.h new file mode 100644 index 0000000000..8dc22cb8a8 --- /dev/null +++ b/lib/utils/include/utils/containers/recurse_n.h @@ -0,0 +1,34 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_RECURSE_N_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_RECURSE_N_H + +#include "utils/exception.h" + +namespace FlexFlow { + +/** + * @brief + * Applies function `f` to value `initial_value` n times recursively. 
+ * + * @example + * auto add_three = [](int x) { return x + 3; }; + * int result = recurse_n(add_three, 3, 5); + * result -> f(f(f(5))) = ((5+3)+3)+3 = 14 + * + * @throws RuntimeError if n is negative + */ +template +T recurse_n(F const &f, int n, T const &initial_value) { + if (n < 0) { + throw mk_runtime_error( + fmt::format("Supplied n={} should be non-negative", n)); + } + T t = initial_value; + for (int i = 0; i < n; i++) { + t = f(t); + } + return t; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h new file mode 100644 index 0000000000..0749497c56 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -0,0 +1,69 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H + +#include "rapidcheck.h" + +#include +#include +#include +#include +#include + +namespace FlexFlow { +class nonnegative_int { +public: + nonnegative_int() = delete; + explicit nonnegative_int(int value); + + explicit operator int() const noexcept; + + bool operator<(nonnegative_int const &other) const; + bool operator==(nonnegative_int const &other) const; + bool operator>(nonnegative_int const &other) const; + bool operator<=(nonnegative_int const &other) const; + bool operator!=(nonnegative_int const &other) const; + bool operator>=(nonnegative_int const &other) const; + + bool operator<(int const &other) const; + bool operator==(int const &other) const; + bool operator>(int const &other) const; + bool operator<=(int const &other) const; + bool operator!=(int const &other) const; + bool operator>=(int const &other) const; + + friend bool operator<(int const &lhs, nonnegative_int const &rhs); + friend bool operator==(int const &lhs, nonnegative_int const &rhs); + friend bool operator>(int const &lhs, nonnegative_int const &rhs); + friend bool operator<=(int const &lhs, nonnegative_int const &rhs); + friend bool operator!=(int const &lhs, nonnegative_int const &rhs); + friend bool operator>=(int const &lhs, nonnegative_int const &rhs); + + nonnegative_int operator+(nonnegative_int const &other) const; + + friend std::ostream &operator<<(std::ostream &os, nonnegative_int const &n); + + friend int format_as(nonnegative_int const &); + + int get_value() const; + +private: + int value_; +}; +} // namespace FlexFlow + +namespace nlohmann { +template <> +struct adl_serializer<::FlexFlow::nonnegative_int> { + static ::FlexFlow::nonnegative_int from_json(json const &j); + static void to_json(json &j, ::FlexFlow::nonnegative_int t); +}; +} // namespace nlohmann + +namespace std { +template <> +struct hash<::FlexFlow::nonnegative_int> { + std::size_t operator()(FlexFlow::nonnegative_int const &n) const noexcept; +}; +} // namespace std + +#endif diff --git a/lib/utils/include/utils/stack_map.h b/lib/utils/include/utils/stack_map.h index c70842de7e..f26deee92d 100644 --- a/lib/utils/include/utils/stack_map.h +++ b/lib/utils/include/utils/stack_map.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_UTILS_STACK_MAP_H #define _FLEXFLOW_UTILS_STACK_MAP_H -#include "utils/stack_vector.h" +#include "utils/stack_vector/stack_vector.h" namespace std { diff --git a/lib/utils/include/utils/stack_string.h b/lib/utils/include/utils/stack_string.h index 2a7b2d1849..c1ab3f4570 100644 --- a/lib/utils/include/utils/stack_string.h +++ b/lib/utils/include/utils/stack_string.h @@ -2,7 +2,7 @@ #define 
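A short usage sketch for the `nonnegative_int` wrapper declared above; the constructor validates at runtime (its definition appears further down in this patch), so a held value is non-negative by construction:

    #include "utils/nonnegative_int/nonnegative_int.h"
    #include <cassert>

    void nonnegative_int_sketch() {
      nonnegative_int three = nonnegative_int{3};
      assert(three.get_value() == 3);
      nonnegative_int seven = three + nonnegative_int{4};  // addition stays in the type
      assert(seven == 7);  // comparisons against plain int work on either side
      // nonnegative_int{-1} would throw std::invalid_argument.
    }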
_FLEXFLOW_UTILS_INCLUDE_STACK_STRING_H #include "fmt/core.h" -#include "stack_vector.h" +#include "stack_vector/stack_vector.h" #include "utils/fmt.h" #include "utils/type_traits.h" #include @@ -57,9 +57,8 @@ struct stack_basic_string { friend struct std::hash; - friend fmt::basic_string_view - format_as(stack_basic_string const &s) { - return {s.contents.data(), s.length()}; + friend std::string format_as(stack_basic_string const &s) { + return {s.contents.cbegin(), s.contents.cend()}; } private: diff --git a/lib/utils/include/utils/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h similarity index 99% rename from lib/utils/include/utils/stack_vector.h rename to lib/utils/include/utils/stack_vector/stack_vector.h index 7a7bce7afc..5d4d6eaad3 100644 --- a/lib/utils/include/utils/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -292,10 +292,6 @@ struct stack_vector { return (this->m_size == 0); } - T const *data() const { - return this->contents.data(); - } - friend std::string format_as(stack_vector const &v) { CHECK_FMTABLE(T); diff --git a/lib/utils/include/utils/stack_vector/stack_vector_of.h b/lib/utils/include/utils/stack_vector/stack_vector_of.h new file mode 100644 index 0000000000..143e59a626 --- /dev/null +++ b/lib/utils/include/utils/stack_vector/stack_vector_of.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_STACK_VECTOR_STACK_VECTOR_OF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_STACK_VECTOR_STACK_VECTOR_OF_H + +#include "stack_vector.h" + +namespace FlexFlow { + +template +stack_vector stack_vector_of(C const &c) { + stack_vector result(c.cbegin(), c.cend()); + return result; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/src/stack_vector.cc b/lib/utils/src/stack_vector.cc deleted file mode 100644 index daecd41a85..0000000000 --- a/lib/utils/src/stack_vector.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/stack_vector.h" diff --git a/lib/utils/src/utils/archetypes/ordered_value_type.cc b/lib/utils/src/utils/archetypes/ordered_value_type.cc new file mode 100644 index 0000000000..572a03e3cf --- /dev/null +++ b/lib/utils/src/utils/archetypes/ordered_value_type.cc @@ -0,0 +1,7 @@ +#include "utils/archetypes/ordered_value_type.h" + +namespace FlexFlow { + +template struct ordered_value_type<0>; + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/recurse_n.cc b/lib/utils/src/utils/containers/recurse_n.cc new file mode 100644 index 0000000000..182db6fd73 --- /dev/null +++ b/lib/utils/src/utils/containers/recurse_n.cc @@ -0,0 +1,12 @@ +#include "utils/containers/recurse_n.h" +#include "utils/archetypes/value_type.h" +#include + +namespace FlexFlow { + +using T = value_type<0>; +using F = std::function; // F :: T -> T + +template T recurse_n(F const &f, int n, T const &initial_value); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc new file mode 100644 index 0000000000..9088cc4bf9 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc @@ -0,0 +1,109 @@ +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +nonnegative_int::nonnegative_int(int value) { + if (value < 0) { + throw std::invalid_argument( + "Value of nonnegative_int type must be nonnegative."); + } + this->value_ = value; +} + +nonnegative_int::operator int() const noexcept { + return this->value_; +} + +bool nonnegative_int::operator<(nonnegative_int const &other) const { + return 
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
new file mode 100644
index 0000000000..9088cc4bf9
--- /dev/null
+++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
@@ -0,0 +1,109 @@
+#include "utils/nonnegative_int/nonnegative_int.h"
+
+namespace FlexFlow {
+
+nonnegative_int::nonnegative_int(int value) {
+  if (value < 0) {
+    throw std::invalid_argument(
+        "Value of nonnegative_int type must be nonnegative.");
+  }
+  this->value_ = value;
+}
+
+nonnegative_int::operator int() const noexcept {
+  return this->value_;
+}
+
+bool nonnegative_int::operator<(nonnegative_int const &other) const {
+  return this->value_ < other.value_;
+}
+bool nonnegative_int::operator==(nonnegative_int const &other) const {
+  return this->value_ == other.value_;
+}
+bool nonnegative_int::operator>(nonnegative_int const &other) const {
+  return this->value_ > other.value_;
+}
+bool nonnegative_int::operator<=(nonnegative_int const &other) const {
+  return this->value_ <= other.value_;
+}
+bool nonnegative_int::operator!=(nonnegative_int const &other) const {
+  return this->value_ != other.value_;
+}
+bool nonnegative_int::operator>=(nonnegative_int const &other) const {
+  return this->value_ >= other.value_;
+}
+
+bool nonnegative_int::operator<(int const &other) const {
+  return this->value_ < other;
+}
+bool nonnegative_int::operator==(int const &other) const {
+  return this->value_ == other;
+}
+bool nonnegative_int::operator>(int const &other) const {
+  return this->value_ > other;
+}
+bool nonnegative_int::operator<=(int const &other) const {
+  return this->value_ <= other;
+}
+bool nonnegative_int::operator!=(int const &other) const {
+  return this->value_ != other;
+}
+bool nonnegative_int::operator>=(int const &other) const {
+  return this->value_ >= other;
+}
+
+bool operator<(int const &lhs, nonnegative_int const &rhs) {
+  return lhs < rhs.value_;
+}
+bool operator==(int const &lhs, nonnegative_int const &rhs) {
+  return lhs == rhs.value_;
+}
+bool operator>(int const &lhs, nonnegative_int const &rhs) {
+  return lhs > rhs.value_;
+}
+bool operator<=(int const &lhs, nonnegative_int const &rhs) {
+  return lhs <= rhs.value_;
+}
+bool operator!=(int const &lhs, nonnegative_int const &rhs) {
+  return lhs != rhs.value_;
+}
+bool operator>=(int const &lhs, nonnegative_int const &rhs) {
+  return lhs >= rhs.value_;
+}
+
+nonnegative_int nonnegative_int::operator+(nonnegative_int const &other) const {
+  return nonnegative_int{this->value_ + other.value_};
+}
+
+std::ostream &operator<<(std::ostream &os, nonnegative_int const &n) {
+  os << n.value_;
+  return os;
+}
+
+int nonnegative_int::get_value() const {
+  return this->value_;
+}
+
+int format_as(nonnegative_int const &x) {
+  return x.get_value();
+}
+} // namespace FlexFlow
+
+namespace nlohmann {
+::FlexFlow::nonnegative_int
+    adl_serializer<::FlexFlow::nonnegative_int>::from_json(json const &j) {
+  return ::FlexFlow::nonnegative_int{j.template get<int>()};
+}
+
+void adl_serializer<::FlexFlow::nonnegative_int>::to_json(
+    json &j, ::FlexFlow::nonnegative_int t) {
+  j = t.get_value();
+}
+} // namespace nlohmann
+
+namespace std {
+std::size_t hash<::FlexFlow::nonnegative_int>::operator()(
+    FlexFlow::nonnegative_int const &n) const noexcept {
+  return std::hash<int>{}(n.get_value());
+}
+} // namespace std
diff --git a/lib/utils/src/utils/stack_string.cc b/lib/utils/src/utils/stack_string.cc
new file mode 100644
index 0000000000..880df64a7c
--- /dev/null
+++ b/lib/utils/src/utils/stack_string.cc
@@ -0,0 +1,7 @@
+#include "utils/stack_string.h"
+
+namespace FlexFlow {
+
+template struct stack_basic_string;
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc
new file mode 100644
index 0000000000..d4fb849412
--- /dev/null
+++ b/lib/utils/src/utils/stack_vector/stack_vector.cc
@@ -0,0 +1,11 @@
+#include "utils/stack_vector/stack_vector.h"
+#include "utils/archetypes/ordered_value_type.h"
+
+namespace FlexFlow {
+
+using T = ordered_value_type<0>;
+
+template struct stack_vector;
+template struct stack_vector;
+
+} // namespace FlexFlow
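One design point in nonnegative_int.cc above: the conversion back to int is declared explicit, so a nonnegative_int never silently decays to a raw integer, and the invariant stays visible at every call site. Hypothetical call sites to illustrate:

    FlexFlow::nonnegative_int n{7};
    // int implicit = n;               // rejected: the conversion is explicit
    int via_cast = static_cast<int>(n); // allowed: intent is visible
    int via_accessor = n.get_value();   // equivalent, and easier to grep for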
diff --git a/lib/utils/src/utils/stack_vector/stack_vector_of.cc b/lib/utils/src/utils/stack_vector/stack_vector_of.cc
new file mode 100644
index 0000000000..c97601646d
--- /dev/null
+++ b/lib/utils/src/utils/stack_vector/stack_vector_of.cc
@@ -0,0 +1,10 @@
+#include "utils/stack_vector/stack_vector_of.h"
+#include "utils/archetypes/ordered_value_type.h"
+
+namespace FlexFlow {
+
+using T = ordered_value_type<0>;
+
+template stack_vector<T, 5> stack_vector_of<5>(std::vector<T> const &vector);
+
+} // namespace FlexFlow
diff --git a/lib/utils/test/src/utils/containers/recurse_n.cc b/lib/utils/test/src/utils/containers/recurse_n.cc
new file mode 100644
index 0000000000..1805ee891f
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/recurse_n.cc
@@ -0,0 +1,29 @@
+#include "utils/containers/recurse_n.h"
+#include <doctest/doctest.h>
+#include <string>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("recurse_n") {
+    auto append_bar = [](std::string const &x) {
+      return x + std::string("Bar");
+    };
+
+    SUBCASE("n = 0") {
+      std::string result = recurse_n(append_bar, 0, std::string("Foo"));
+      std::string correct = "Foo";
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n = 3") {
+      std::string result = recurse_n(append_bar, 3, std::string("Foo"));
+      std::string correct = "FooBarBarBar";
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n < 0") {
+      CHECK_THROWS(recurse_n(append_bar, -1, std::string("Foo")));
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc
new file mode 100644
index 0000000000..73d382d830
--- /dev/null
+++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc
@@ -0,0 +1,263 @@
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("nonnegative_int initialization") {
+    SUBCASE("positive int initialization") {
+      CHECK_NOTHROW(nonnegative_int{1});
+    }
+
+    SUBCASE("zero initialization") {
+      CHECK_NOTHROW(nonnegative_int{0});
+    }
+
+    SUBCASE("negative int initialization") {
+      CHECK_THROWS(nonnegative_int{-1});
+    }
+  }
+
+  TEST_CASE("nonnegative_int == comparisons") {
+    nonnegative_int nn_int_1a = nonnegative_int{1};
+    nonnegative_int nn_int_1b = nonnegative_int{1};
+    nonnegative_int nn_int_2 = nonnegative_int{2};
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, equal") {
+      CHECK(nn_int_1a == nn_int_1b);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, not equal") {
+      CHECK_FALSE(nn_int_1a == nn_int_2);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, equal") {
+      CHECK(nn_int_1a == 1);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, not equal") {
+      CHECK_FALSE(nn_int_1a == 2);
+    }
+    SUBCASE("LHS: int, RHS: nonnegative_int, equal") {
+      CHECK(1 == nn_int_1b);
+    }
+    SUBCASE("LHS: int, RHS: nonnegative_int, not equal") {
+      CHECK_FALSE(2 == nn_int_1b);
+    }
+  }
+
+  TEST_CASE("nonnegative_int != comparisons") {
+    nonnegative_int nn_int_1a = nonnegative_int{1};
+    nonnegative_int nn_int_1b = nonnegative_int{1};
+    nonnegative_int nn_int_2 = nonnegative_int{2};
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, equal") {
+      CHECK_FALSE(nn_int_1a != nn_int_1b);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, not equal") {
+      CHECK(nn_int_1a != nn_int_2);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, equal") {
+      CHECK_FALSE(nn_int_1a != 1);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, not equal") {
+      CHECK(nn_int_1a != 2);
+    }
+    SUBCASE("LHS: int, RHS: nonnegative_int, equal") {
+      CHECK_FALSE(1 != nn_int_1b);
+    }
+
SUBCASE("LHS: int, RHS: nonnegative_int, not equal") { + CHECK(2 != nn_int_1b); + } + } + + TEST_CASE("nonnegative_int < comparisons") { + nonnegative_int nn_int_1a = nonnegative_int{1}; + nonnegative_int nn_int_1b = nonnegative_int{1}; + nonnegative_int nn_int_2 = nonnegative_int{2}; + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, less than") { + CHECK(nn_int_1a < nn_int_2); + } + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, equals") { + CHECK_FALSE(nn_int_1a < nn_int_1b); + } + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, greater than") { + CHECK_FALSE(nn_int_2 < nn_int_1b); + } + SUBCASE("LHS: nonnegative_int, RHS: int, less than") { + CHECK(nn_int_1a < 2); + } + SUBCASE("LHS: nonnegative_int, RHS: int, equals") { + CHECK_FALSE(nn_int_1a < 1); + } + SUBCASE("LHS: nonnegative_int, RHS: int, greater than") { + CHECK_FALSE(nn_int_2 < 1); + } + SUBCASE("LHS: int, RHS: nonnegative_int, less than") { + CHECK(1 < nn_int_2); + } + SUBCASE("LHS: int, RHS: nonnegative_int, equals") { + CHECK_FALSE(1 < nn_int_1b); + } + SUBCASE("LHS: int, RHS: nonnegative_int, greater than") { + CHECK_FALSE(2 < nn_int_1b); + } + } + + TEST_CASE("nonnegative_int <= comparisons") { + nonnegative_int nn_int_1a = nonnegative_int{1}; + nonnegative_int nn_int_1b = nonnegative_int{1}; + nonnegative_int nn_int_2 = nonnegative_int{2}; + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, less than") { + CHECK(nn_int_1a <= nn_int_2); + } + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, equals") { + CHECK(nn_int_1a <= nn_int_1b); + } + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, greater than") { + CHECK_FALSE(nn_int_2 <= nn_int_1b); + } + SUBCASE("LHS: nonnegative_int, RHS: int, less than") { + CHECK(nn_int_1a <= 2); + } + SUBCASE("LHS: nonnegative_int, RHS: int, equals") { + CHECK(nn_int_1a <= 1); + } + SUBCASE("LHS: nonnegative_int, RHS: int, greater than") { + CHECK_FALSE(nn_int_2 <= 1); + } + SUBCASE("LHS: int, RHS: nonnegative_int, less than") { + CHECK(1 <= nn_int_2); + } + SUBCASE("LHS: int, RHS: nonnegative_int, equals") { + CHECK(1 <= nn_int_1b); + } + SUBCASE("LHS: int, RHS: nonnegative_int, greater than") { + CHECK_FALSE(2 <= nn_int_1b); + } + } + + TEST_CASE("nonnegative_int > comparisons") { + nonnegative_int nn_int_1a = nonnegative_int{1}; + nonnegative_int nn_int_1b = nonnegative_int{1}; + nonnegative_int nn_int_2 = nonnegative_int{2}; + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, less than") { + CHECK_FALSE(nn_int_1a > nn_int_2); + } + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, equals") { + CHECK_FALSE(nn_int_1a > nn_int_1b); + } + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, greater than") { + CHECK(nn_int_2 > nn_int_1b); + } + SUBCASE("LHS: nonnegative_int, RHS: int, less than") { + CHECK_FALSE(nn_int_1a > 2); + } + SUBCASE("LHS: nonnegative_int, RHS: int, equals") { + CHECK_FALSE(nn_int_1a > 1); + } + SUBCASE("LHS: nonnegative_int, RHS: int, greater than") { + CHECK(nn_int_2 > 1); + } + SUBCASE("LHS: int, RHS: nonnegative_int, less than") { + CHECK_FALSE(1 > nn_int_2); + } + SUBCASE("LHS: int, RHS: nonnegative_int, equals") { + CHECK_FALSE(1 > nn_int_1b); + } + SUBCASE("LHS: int, RHS: nonnegative_int, greater than") { + CHECK(2 > nn_int_1b); + } + } + + TEST_CASE("nonnegative_int >= comparisons") { + nonnegative_int nn_int_1a = nonnegative_int{1}; + nonnegative_int nn_int_1b = nonnegative_int{1}; + nonnegative_int nn_int_2 = nonnegative_int{2}; + SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, less than") { + CHECK_FALSE(nn_int_1a >= 
nn_int_2);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, equals") {
+      CHECK(nn_int_1a >= nn_int_1b);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int, greater than") {
+      CHECK(nn_int_2 >= nn_int_1b);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, less than") {
+      CHECK_FALSE(nn_int_1a >= 2);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, equals") {
+      CHECK(nn_int_1a >= 1);
+    }
+    SUBCASE("LHS: nonnegative_int, RHS: int, greater than") {
+      CHECK(nn_int_2 >= 1);
+    }
+    SUBCASE("LHS: int, RHS: nonnegative_int, less than") {
+      CHECK_FALSE(1 >= nn_int_2);
+    }
+    SUBCASE("LHS: int, RHS: nonnegative_int, equals") {
+      CHECK(1 >= nn_int_1b);
+    }
+    SUBCASE("LHS: int, RHS: nonnegative_int, greater than") {
+      CHECK(2 >= nn_int_1b);
+    }
+  }
+
+  TEST_CASE("nonnegative_int + operation") {
+    nonnegative_int nn_int_1a = nonnegative_int{1};
+    nonnegative_int nn_int_1b = nonnegative_int{1};
+    nonnegative_int nn_int_2 = nonnegative_int{2};
+    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int") {
+      CHECK(nn_int_1a + nn_int_1b == nn_int_2);
+    }
+  }
+
+  TEST_CASE("adl_serializer") {
+    SUBCASE("to_json") {
+      nonnegative_int input = nonnegative_int{5};
+
+      nlohmann::json result = input;
+      nlohmann::json correct = 5;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("from_json") {
+      nlohmann::json input = 5;
+
+      nonnegative_int result = input.template get<nonnegative_int>();
+      nonnegative_int correct = nonnegative_int{5};
+
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("std::hash<nonnegative_int>") {
+    nonnegative_int nn_int_1a = nonnegative_int{1};
+    nonnegative_int nn_int_1b = nonnegative_int{1};
+    nonnegative_int nn_int_2 = nonnegative_int{2};
+    std::hash<nonnegative_int> hash_fn;
+    SUBCASE("Identical values have the same hash") {
+      CHECK(hash_fn(nn_int_1a) == hash_fn(nn_int_1b));
+    }
+    SUBCASE("Different values have different hashes") {
+      CHECK(hash_fn(nn_int_1a) != hash_fn(nn_int_2));
+    }
+    SUBCASE("Unordered set works with nonnegative_int") {
+      std::unordered_set<::FlexFlow::nonnegative_int> nonnegative_int_set;
+      nonnegative_int_set.insert(nn_int_1a);
+      nonnegative_int_set.insert(nn_int_1b);
+      nonnegative_int_set.insert(nn_int_2);
+
+      CHECK(nonnegative_int_set.size() == 2);
+    }
+  }
+
+  TEST_CASE("nonnegative_int << operator") {
+    nonnegative_int nn_int_1 = nonnegative_int{1};
+    std::ostringstream oss;
+    oss << nn_int_1;
+
+    CHECK(oss.str() == "1");
+  }
+
+  TEST_CASE("fmt::to_string(nonnegative_int)") {
+    nonnegative_int nn_int_1 = nonnegative_int{1};
+    CHECK(fmt::to_string(nn_int_1) == "1");
+  }
+}
diff --git a/lib/utils/test/src/utils/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc
similarity index 98%
rename from lib/utils/test/src/utils/stack_vector.cc
rename to lib/utils/test/src/utils/stack_vector/stack_vector.cc
index 6cdd91ece1..c36de733b6 100644
--- a/lib/utils/test/src/utils/stack_vector.cc
+++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc
@@ -1,4 +1,4 @@
-#include "utils/stack_vector.h"
+#include "utils/stack_vector/stack_vector.h"
 #include "test/utils/doctest/fmt/vector.h"
 #include "test/utils/rapidcheck.h"
 #include <doctest/doctest.h>
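The fmt::to_string(nonnegative_int) test above passes without any fmt::formatter specialization because {fmt} looks up a free format_as function by argument-dependent lookup and formats its return value instead; the friend format_as(nonnegative_int const &) returning int is exactly that hook. A self-contained sketch of the same mechanism, using a hypothetical wrapper type:

    #include <fmt/format.h>

    namespace demo {
    struct meters {
      double value;
    };

    // {fmt} finds this overload via ADL and formats the returned double.
    double format_as(meters const &m) {
      return m.value;
    }
    } // namespace demo

    int main() {
      fmt::print("{} m\n", demo::meters{3.5}); // prints "3.5 m"
    }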
diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector_of.cc b/lib/utils/test/src/utils/stack_vector/stack_vector_of.cc
new file mode 100644
index 0000000000..2228da553a
--- /dev/null
+++ b/lib/utils/test/src/utils/stack_vector/stack_vector_of.cc
@@ -0,0 +1,16 @@
+#include "utils/stack_vector/stack_vector_of.h"
+#include "test/utils/doctest/fmt/vector.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("stack_vector_of(std::vector)") {
+    std::vector<int> input = {1, 2, 3};
+    const size_t MAXSIZE = 5;
+    stack_vector<int, MAXSIZE> result = stack_vector_of<MAXSIZE>(input);
+    stack_vector<int, MAXSIZE> correct = {1, 2, 3};
+
+    CHECK(result == correct);
+  }
+}
diff --git a/machine_config_example b/machine_config_example
deleted file mode 100644
index 11b896805b..0000000000
--- a/machine_config_example
+++ /dev/null
@@ -1,49 +0,0 @@
-# This is an example of config file for the new machine model
-# comp_device:
-# Compute devices are created evenly based on the following settings.
-num_nodes = 2
-num_sockets_per_node = 2
-num_cpus_per_socket = 10
-num_gpus_per_socket = 2
-
-# mem_device:
-# Memories are created automatically. Currently, we support three kinds of memories - system memory, zero-copy memory, and GPU framebuffer memory. Each socket has one system memory (sys_mem) and one zero-copy memory (z_copy_mem); each GPU has one frame buffer memory (gpu_fb_mem).
-
-# comm_device:
-# Communication devices describe the links between the memories. Each communication device needs two parameters - latency in ms and bandwidth in GB/s. An easy way to get these numbers is using the Memspeed benchmark in legion/test/realm. Currently, we provide the following communication devices:
-# memcpy
-membus_latency = 0.00003
-membus_bandwidth = 4.26623
-# inter-socket links
-upi_latency = 0.0004
-upi_bandwidth = 10.14039
-# inter-node links, the third argument means the number of NICs per socket (O means one NIC per node)
-nic_latency = 0.000507
-nic_bandwidth = 10.9448431
-nic_persocket = 0
-# pci-e between CPU and GPU
-pci_latency = 0.001
-pci_bandwidth = 12.578468749999999
-# nvlinks between GPUs
-nvlink_latency = 0.001
-nvlink_bandwidth = 18.52
-
-# paths:
-# This section describes the communication paths (a list of communication devices) between memories. These paths could change based on many factors, such as hardware, the version and settings of Gasnet and Legion. Please refer to the find_shortest_path function in legoin/runtime/realm/transfer/lowlevel_dma.cc to see the exact paths.
-# Setting a path to null will ignore any cost of the communications on that path.
-intra_socket_sys_mem_to_sys_mem = membus
-inter_socket_sys_mem_to_sys_mem = upi
-inter_node_sys_mem_to_sys_mem = nic
-
-intra_socket_gpu_fb_mem_to_gpu_fb_mem = nvlink
-inter_socket_gpu_fb_mem_to_gpu_fb_mem = nvlink
-inter_node_gpu_fb_mem_to_gpu_fb_mem = pci_to_host nic pci_to_dev
-
-intra_socket_sys_mem_to_gpu_fb_mem = membus pci_to_dev
-inter_socket_sys_mem_to_gpu_fb_mem = upi pci_to_dev
-inter_node_sys_mem_to_gpu_fb_mem = nic pci_to_dev
-
-intra_socket_gpu_fb_mem_to_sys_mem = pci_to_host
-inter_socket_gpu_fb_mem_to_sys_mem = pci_to_host upi
-inter_node_gpu_fb_mem_to_sys_mem = pci_to_host nic membus
-
diff --git a/python/flexflow/torch/__init__.py b/python/flexflow/torch/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/scripts/format.sh b/scripts/format.sh
deleted file mode 100755
index e4f1ec1611..0000000000
--- a/scripts/format.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-
-GIT_ROOT="$(git rev-parse --show-toplevel)"
-cd "$GIT_ROOT"
-
-TOOLS_PATH="$GIT_ROOT/.tools"
-RELEASE="master-f4f85437"
-CLANG_FORMAT_VERSION="16"
-CLANG_FORMAT_PATH="$TOOLS_PATH/clang-format-$CLANG_FORMAT_VERSION-$RELEASE"
-
-mkdir -p "$TOOLS_PATH"
-
-error() {
-  >&2 echo "$@"
-  exit 1
-}
-
-get_os() {
-  UNAME_OUTPUT="$(uname -s)"
-  case "$UNAME_OUTPUT" in
-    Linux*)
-      OS=Linux
-      ;;
-    Darwin*)
-      OS=Mac
-      ;;
-    *)
-      error "Unknown OS $UNAME_OUTPUT. Exiting..."
- esac - - echo "$OS" -} - -download_clang_tool() { - TOOL="$1" - VERSION="$2" - TARGET_PATH="$3" - - BASE_URL="https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/$RELEASE/" - - OS="$(get_os)" - case "$OS" in - Linux) - URL_OS="linux" - ;; - Mac) - URL_OS="macosx" - ;; - *) - error "Unknown return value from get_os: $OS. Exiting..." - esac - URL="$BASE_URL/clang-${TOOL}-${VERSION}_${URL_OS}-amd64" - echo "Downloading from $URL..." - - if command -v wget &> /dev/null; then - wget "$URL" -O "$TARGET_PATH" - elif command -v curl &> /dev/null; then - curl -L "$URL" -o "$TARGET_PATH" - else - error "Could not find either wget or curl. Exiting..." - fi -} - -if [[ ! -e $CLANG_FORMAT_PATH ]]; then - download_clang_tool format "$CLANG_FORMAT_VERSION" "$CLANG_FORMAT_PATH" - chmod u+x "$CLANG_FORMAT_PATH" -fi - -CLANG_FORMAT_CONFIG="$GIT_ROOT/.clang-format-for-format-sh" -mapfile -t FILES < <(git ls-files ':!:triton/**' '*.h' '*.cc' '*.cpp' '*.cu' '*.c' '*.decl') -if [[ -f $CLANG_FORMAT_CONFIG ]]; then - "$CLANG_FORMAT_PATH" --style=file:"$CLANG_FORMAT_CONFIG" -i "${FILES[@]}" -else - echo "error" -fi diff --git a/scripts/gdb/pretty_print.py b/scripts/gdb/pretty_print.py deleted file mode 100644 index 4cccc9b76b..0000000000 --- a/scripts/gdb/pretty_print.py +++ /dev/null @@ -1,95 +0,0 @@ -import gdb.printing - -class NodePrinter: - def __init__(self, val): - self.val = val - - def to_string(self): - ptr = self.val["ptr"] - if ptr != 0: - op_type = ptr.referenced_value()['op_type'] - return f'Node' - else: - return f'Node' - -class EdgePrinter: - def __init__(self, val): - self.val = val - - def to_string(self): - return f'Edge' - -class MachineViewPrinter: - def __init__(self, val): - self.val = val - - def to_string(self): - toks = [] - if self.val['device_type'] == 0: - toks.append('type=GPU') - else: - toks.append('type=CPU') - start_device_id = self.val['start_device_id'] - for i in range(self.val['ndims']): - dim = self.val['dim'][i] - stride = self.val['stride'][i] - toks.append(f'{i}=[{start_device_id}:{start_device_id+dim}:{stride}]') - return f'MachineView<{" ".join(toks)}>' - -class DomainPrinter: - def __init__(self, val): - self.val = val - - def to_string(self): - toks = [] - ndim = self.val['dim'] - for i in range(ndim): - lo = self.val['rect_data'][i] - hi = self.val['rect_data'][i + ndim] - toks.append(f'{i}=[{lo}:{hi}]') - return f'Domain<{" ".join(toks)}>' - -class TensorShapePrinter: - def __init__(self, val): - self.val = val - - def to_string(self): - toks = [] - ndim = self.val['num_dims'] - for i in range(ndim): - dim = self.val['dims'][i] - size = dim['size'] - degree = dim['degree'] - parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') - return f'TensorShape<{" ".join(toks)}>' - -class ParallelTensorBasePrinter: - def __init__(self, val): - self.val = val - - def to_string(self): - toks = [] - toks.append(f'guid={self.val["parallel_tensor_guid"]}') - ndim = self.val['num_dims'] - for i in range(ndim): - dim = self.val['dims'][i] - size = dim['size'] - degree = dim['degree'] - parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') - return f'ParallelTensorBase<{" ".join(toks)}>' - -def build_pretty_printer(): - pp = gdb.printing.RegexpCollectionPrettyPrinter( - "flexflow") - pp.add_printer('Node', '^FlexFlow::PCG::Node$', NodePrinter) - pp.add_printer('Edge', '^FlexFlow::PCG::Edge$', EdgePrinter) - pp.add_printer('MachineView', '^FlexFlow::MachineView$', 
MachineViewPrinter) - pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter) - pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter) - pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter) - return pp - -gdb.printing.register_pretty_printer( - gdb.current_objfile(), build_pretty_printer(), replace=True) diff --git a/scripts/mnist_mlp_run.sh b/scripts/mnist_mlp_run.sh deleted file mode 100755 index 8842790e6a..0000000000 --- a/scripts/mnist_mlp_run.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -eval "$(conda shell.bash hook)" -conda activate flexflow -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib -~/FlexFlow/python/flexflow_python ~/FlexFlow/examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 diff --git a/scripts/osdi22ae/bert.sh b/scripts/osdi22ae/bert.sh deleted file mode 100755 index 18d2c3195c..0000000000 --- a/scripts/osdi22ae/bert.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running BERT with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/Transformer/transformer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 -b 8 --budget 30 - -echo "Running BERT Uno with data parallelism" -"$FF_HOME"/build/examples/cpp/Transformer/transformer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 -b 8 --budget 30 --only-data-parallel diff --git a/scripts/osdi22ae/candle_uno.sh b/scripts/osdi22ae/candle_uno.sh deleted file mode 100755 index 22458149f1..0000000000 --- a/scripts/osdi22ae/candle_uno.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running CANDLE Uno with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/candle_uno/candle_uno -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running CANDLE Uno with data parallelism" -"$FF_HOME"/build/examples/cpp/candle_uno/candle_uno -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/dlrm.sh b/scripts/osdi22ae/dlrm.sh deleted file mode 100755 index a75e78bc0a..0000000000 --- a/scripts/osdi22ae/dlrm.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running DLRM with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/DLRM/dlrm -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running DLRM with data parallelism" -"$FF_HOME"/build/examples/cpp/DLRM/dlrm -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/inception.sh b/scripts/osdi22ae/inception.sh deleted file mode 100755 index 7b6c079eab..0000000000 --- a/scripts/osdi22ae/inception.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running Inception-v3 with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/InceptionV3/inception -ll:gpu 4 -ll:fsize 11000 -ll:zsize 14000 -b 64 --budget 10 - -echo "Running Inception-v3 with data parallelism" -"$FF_HOME"/build/examples/cpp/InceptionV3/inception -ll:gpu 4 -ll:fsize 11000 -ll:zsize 14000 -b 64 --budget 10 --only-data-parallel diff --git a/scripts/osdi22ae/mlp.sh b/scripts/osdi22ae/mlp.sh deleted file mode 100755 index fa84607983..0000000000 --- a/scripts/osdi22ae/mlp.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! 
/usr/bin/env bash - -echo "Running MLP with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/MLP_Unify/mlp_unify -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running MLP with data parallelism" -"$FF_HOME"/build/examples/cpp/MLP_Unify/mlp_unify -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/resnext-50.sh b/scripts/osdi22ae/resnext-50.sh deleted file mode 100755 index c73e079361..0000000000 --- a/scripts/osdi22ae/resnext-50.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running ResNeXt-50 with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/resnext50/resnext50 -ll:gpu 4 -ll:fsize 12000 -ll:zsize 14000 -b 16 --budget 20 - -echo "Running ResNeXt-50 with data parallelism" -"$FF_HOME"/build/examples/cpp/resnext50/resnext50 -ll:gpu 4 -ll:fsize 12000 -ll:zsize 14000 -b 16 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/xdl.sh b/scripts/osdi22ae/xdl.sh deleted file mode 100755 index fcb5172b30..0000000000 --- a/scripts/osdi22ae/xdl.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running XDL with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/XDL/xdl -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running XDL with data parallelism" -"$FF_HOME"/build/examples/cpp/XDL/xdl -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/tests/align/README.md b/tests/align/README.md deleted file mode 100644 index 1a3d9a6211..0000000000 --- a/tests/align/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# FlexFlow-PyTorch Alignment - -This is an ongoing effort to align FlexFlow with PyTorch as a means to verify -the correctness of FlexFlow. Support for additional operators will be coming -soon, and all alignment files here are subject to change. -## Install the Python dependencies -install `pytest` module in flexflow environment. - -## Running the Alignment Tests -Note that FlexFlow requires a CPU installation of PyTorch, so we recommend a -separate `conda` environment for each (e.g. named `flexflow` and `pytorch`, -respectively). 
- -Assuming those two `conda` environments, we may run -``` -cd FlexFlow -conda activate flexflow -./tests/align/test_all_operators.sh -``` - diff --git a/tests/align/add/align_add_ff.py b/tests/align/add/align_add_ff.py deleted file mode 100644 index d5d9c633a5..0000000000 --- a/tests/align/add/align_add_ff.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import Linear, Op, Parameter -from flexflow.type import AggrMode - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import BATCH_SIZE, gen_tensor - -OUT_DIR = os.path.join("align", "add", "out") - - -def run(): - INPUT_SIZE = 512 - SEQ_LENGTH = 5 - inp1: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - inp2: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.add( - x=input_tensor_1, - y=input_tensor_2, - name="add" - ) - - # compile - compile_ffmodel(ffmodel) - dls = init_ffmodel(ffmodel, ((input_tensor_1, inp1), (input_tensor_2, inp2)), label) - assert len(dls) == 3 - inp1_dl, inp2_dl, label_dl = dls - - # forward/backward pass - run_fwd_bwd(ffmodel, ffconfig, (inp1_dl, inp2_dl), label_dl) - - # save data - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - - - - -if __name__ == "__main__": - run() diff --git a/tests/align/add/align_add_torch.py b/tests/align/add/align_add_torch.py deleted file mode 100644 index ab1f18c799..0000000000 --- a/tests/align/add/align_add_torch.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -OUT_DIR = os.path.join("align", "add", "out") - -def run(): - INPUT_SIZE = 512 - SEQ_LENGTH = 5 - - inp1: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - inp2: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - output = torch.add( - input=inp1, - other=inp2 - ).to(DEVICE) - output.requires_grad = True - output.retain_grad() - - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - - -if __name__ == "__main__": - run() \ No newline at end of file diff --git a/tests/align/add/gen_tensors.sh b/tests/align/add/gen_tensors.sh deleted file mode 100755 index d5f2e4e801..0000000000 --- a/tests/align/add/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! 
/usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/add/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/add/align_add_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/add/align_add_torch.py; - diff --git a/tests/align/align_create_tensor_ff.py b/tests/align/align_create_tensor_ff.py deleted file mode 100644 index 2dbcb942d3..0000000000 --- a/tests/align/align_create_tensor_ff.py +++ /dev/null @@ -1,672 +0,0 @@ -import os -import sys -import torch -import argparse -from flexflow.core import * -from flexflow.core.flexflow_cffi import Linear, Op, Parameter -from flexflow.type import AggrMode -from flexflow.torch.model import GetItemNode, FunctionNode -sys.path.append("./align/") - -from align_utils import parse_create_tensor_args, gen_tensor, create_general_test_tensor_torch, BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) - - -# set of operaters that have weight/bias -param_weight_op = {'conv2d': Conv2D, 'embedding': Embedding, - 'layernorm': LayerNorm, 'view_embedding': Embedding, 'linear': Linear} -param_bias_op = {'conv2d': Conv2D, 'layernorm': LayerNorm, 'linear': Linear} - - -def create_single_operator_ff(): - args = parse_create_tensor_args() - operator_name = args.operator - OUT_DIR = os.path.join("tests", "align", "out", operator_name) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - - if operator_name == 'add': - input_tensors, label, output_tensor = create_tensors_for_add_ff( - ffmodel) - elif operator_name == 'concat': - input_tensors, label, output_tensor = create_tensors_for_concat_ff( - ffmodel) - elif operator_name == 'conv2d': - input_tensors, label, output_tensor = create_tensors_for_conv2d_ff( - ffmodel) - elif operator_name == 'cos': - input_tensors, label, output_tensor = create_tensors_for_cos_ff( - ffmodel) - elif operator_name == 'embedding': - input_tensors, label, output_tensor = create_tensors_for_embedding_ff( - ffmodel) - elif operator_name == 'exp': - input_tensors, label, output_tensor = create_tensors_for_exp_ff( - ffmodel) - elif operator_name == 'flat': - input_tensors, label, output_tensor = create_tensors_for_flat_ff( - ffmodel) - elif operator_name == 'getitem': - input_tensors, label, output_tensor = create_tensors_for_getitem_ff( - ffmodel) - elif operator_name == 'identity': - input_tensors, label, output_tensor = create_tensors_for_identity_ff( - ffmodel) - elif operator_name == 'layernorm': - input_tensors, label, output_tensor = create_tensors_for_layernorm_ff( - ffmodel) - elif operator_name == 'linear': - input_tensors, label, output_tensor = create_tensors_for_linear_ff( - ffmodel) - elif operator_name == 'multiply': - input_tensors, label, output_tensor = create_tensors_for_multiply_ff( - ffmodel) - elif operator_name == 'pool2d': - input_tensors, label, output_tensor = create_tensors_for_pool2d_ff( - ffmodel) - elif operator_name == 'reducesum': - input_tensors, label, output_tensor = create_tensors_for_reducesum_ff( - ffmodel) - elif operator_name == 'relu': - input_tensors, label, output_tensor = create_tensors_for_relu_ff( - ffmodel) - elif operator_name == 'reshape': - input_tensors, label, output_tensor = create_tensors_for_reshape_ff( - ffmodel) - elif operator_name == 'scalar_add': - input_tensors, label, output_tensor = create_tensors_for_scalar_add_ff( - ffmodel) - elif operator_name == 'scalar_multiply': - 
input_tensors, label, output_tensor = create_tensors_for_scalar_multiply_ff( - ffmodel) - elif operator_name == 'scalar_sub': - input_tensors, label, output_tensor = create_tensors_for_scalar_sub_ff( - ffmodel) - elif operator_name == 'scalar_truediv': - input_tensors, label, output_tensor = create_tensors_for_scalar_truediv_ff( - ffmodel) - elif operator_name == 'sigmoid': - input_tensors, label, output_tensor = create_tensors_for_sigmoid_ff( - ffmodel) - elif operator_name == 'sin': - input_tensors, label, output_tensor = create_tensors_for_sin_ff( - ffmodel) - elif operator_name == 'subtract': - input_tensors, label, output_tensor = create_tensors_for_subtract_ff( - ffmodel) - elif operator_name == 'tanh': - input_tensors, label, output_tensor = create_tensors_for_tanh_ff( - ffmodel) - elif operator_name == 'transpose': - input_tensors, label, output_tensor = create_tensors_for_transpose_ff( - ffmodel) - elif operator_name == 'view_embedding': - input_tensors, label, output_tensor = create_tensors_for_view_embedding_ff( - ffmodel) - elif operator_name == 'max': - input_tensors, label, output_tensor = create_tensors_for_max_ff( - ffmodel) - elif operator_name == 'min': - input_tensors, label, output_tensor = create_tensors_for_min_ff( - ffmodel) - elif operator_name == 'gather': - input_tensors, label, output_tensor = create_tensors_for_gather_ff( - ffmodel) - else: - raise ValueError( - 'Not include such Operator in Aligment Test', operator_name) - - compile_ffmodel(ffmodel) - dataloaders = init_ffmodel(ffmodel, input_tensors, label) - assert len(dataloaders) == len(input_tensors) + 1 - - input_dataloaders = dataloaders[0: len(dataloaders) - 1] - label_dataloaders = dataloaders[len(dataloaders) - 1] - # forward/backward pass - run_fwd_bwd(ffmodel, ffconfig, input_dataloaders, label_dataloaders) - # save data - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, - os.path.join(OUT_DIR, "ff_out_grad.pt")) - - # save weight and bias tensor for some operators - layer: Op = ffmodel.get_layers()[0] - if operator_name == 'view_embedding': - layer = ffmodel.get_layers()[1] - - if operator_name in param_weight_op: - assert isinstance(layer, param_weight_op[operator_name]) - weight: Parameter = layer.get_weight_tensor() - save_param_ff(weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt")) - save_param_grad_ff(weight, ffmodel, os.path.join( - OUT_DIR, "ff_weight_grad.pt")) - if operator_name in param_bias_op: - assert isinstance(layer, param_bias_op[operator_name]) - bias: Parameter = layer.get_bias_tensor() - save_param_ff(bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt")) - save_param_grad_ff(bias, ffmodel, os.path.join( - OUT_DIR, "ff_bias_grad.pt")) - - -def create_tensors_for_add_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.add( - x=input_tensor_1, - y=input_tensor_2, - name="add" - ) - return ((input_tensor_1, inp1), (input_tensor_2, inp2)), label, output_tensor - - -def create_tensors_for_concat_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch() - inp3 = create_general_test_tensor_torch() - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH * 3, INPUT_SIZE), - dtype="float32" - ) - - 
input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - input_tensor_3 = ffmodel.create_tensor(inp3.shape, DataType.DT_FLOAT) - - output_tensor = ffmodel.concat( - tensors=[input_tensor_1, input_tensor_2, input_tensor_3], - axis=1, - name="concat" - ) - return ((input_tensor_1, inp1), (input_tensor_2, inp2), (input_tensor_3, inp3)), label, output_tensor - - -def create_tensors_for_conv2d_ff(ffmodel): - KERNEL_SIZE = 3 - INPUT_SIZE = 512 - IN_CHANNELS = 3 - OUTPUT_SIZE = 510 - OUT_CHANNELS = 5 - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, OUT_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE), - dtype="float32" - ) - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.conv2d( - input=input_tensor, - out_channels=OUT_CHANNELS, - kernel_h=KERNEL_SIZE, - kernel_w=KERNEL_SIZE, - stride_h=1, - stride_w=1, - padding_h=0, - padding_w=0, - name="conv2d" - ) - return ((input_tensor, inp), ), label, output_tensor - - -def create_tensors_for_cos_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.cos( - x=input_tensor, - name="cos" - ) - return ((input_tensor, inp), ), label, output_tensor - - -def create_tensors_for_divide_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.divide( - x=input_tensor_1, - y=input_tensor_2, - name="divide" - ) - return ((input_tensor_1, inp1), (input_tensor_2, inp2)), label, output_tensor - - -def create_tensors_for_embedding_ff(ffmodel): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_INT64) - output_tensor = ffmodel.embedding( - input=input_tensor, - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - aggr=AggrMode.AGGR_MODE_NONE, - kernel_initializer=NormInitializer(seed=42, mean=0, stddev=1), - name="embedding", - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_exp_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.exp( - x=input_tensor, - name="exp" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_flat_ff(ffmodel): - INPUT_SIZE_2 = 512 - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE, INPUT_SIZE_2), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE * INPUT_SIZE_2 * SEQ_LENGTH), - dtype="float32" - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.flat( - input=input_tensor, - name="flat" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_getitem_ff(ffmodel): - attention_mask = gen_tensor( - 
(BATCH_SIZE, SEQ_LENGTH), - dtype="float32", - low=0, - high=2, - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="float32", - ) # unused - - attention_mask_tensor = ffmodel.create_tensor( - attention_mask.shape, - DataType.DT_FLOAT, - ) - extended_attention_mask = GetItemNode.slice_tensor( - ffmodel, - attention_mask_tensor, - (slice(None, None, None), None, None, slice(None, None, None)), - "slice", - ) - return ((attention_mask_tensor, attention_mask),), label, extended_attention_mask - - -def create_tensors_for_identity_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.identity( - input=input_tensor, - name="identity" - ) - return ((input_tensor, inp),), label, output_tensor - -def create_tensors_for_layernorm_ff(ffmodel): - HIDDEN_SIZE = 512 - EPS = 1e-6 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.layer_norm( - input=input_tensor, - axes=[len(input_tensor.dims) - 1], # normalize over the last dimension - elementwise_affine=True, - eps=EPS, - name="layernorm", - ) - return ((input_tensor, inp),), label, output_tensor - -def create_tensors_for_linear_ff(ffmodel): - OUTPUT_SIZE = 128 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, OUTPUT_SIZE), - dtype="float32" - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.dense( - input=input_tensor, - out_dim=128, - name="linear" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_multiply_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.multiply( - x=input_tensor_1, - y=input_tensor_2, - name="multiply" - ) - return ((input_tensor_1, inp1), (input_tensor_2, inp2)), label, output_tensor - - -def create_tensors_for_pool2d_ff(ffmodel): - KERNEL_SIZE = 3 - IN_CHANNELS = 3 - OUTPUT_SIZE = 510 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE), - dtype="float32" - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - - output_tensor = ffmodel.pool2d( - input=input_tensor, - kernel_h=KERNEL_SIZE, - kernel_w=KERNEL_SIZE, - stride_h=1, - stride_w=1, - padding_h=0, - padding_w=0, - name="pool2d" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_reducesum_ff(ffmodel): - inp = create_general_test_tensor_torch() - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE), - dtype="float32" - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.reduce_sum( - input=input_tensor, - axes=(1,), - keepdims=False, - name="reducesum" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_relu_ff(ffmodel): - inp = 
create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.relu( - input=input_tensor, - name="relu" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_reshape_ff(ffmodel): - inp = create_general_test_tensor_torch() - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH), - dtype="float32" - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.reshape( - input=input_tensor, - shape=(BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH), - name="reshape" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_scalar_add_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.scalar_add( - input=input_tensor, - scalar=1, - inplace=False, - name="scalar_add" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_scalar_multiply_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.scalar_multiply( - input=input_tensor, - scalar=2, - inplace=False, - name="scalar_multiply" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_scalar_sub_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.scalar_sub( - input=input_tensor, - scalar=1, - inplace=False, - name="scalar_sub" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_scalar_truediv_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.scalar_true_divide( - input=input_tensor, - scalar=2, - inplace=False, - name="scalar_truediv" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_sigmoid_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.sigmoid( - input=input_tensor, - name="sigmoid" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_sin_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.sin( - x=input_tensor, - name="sin" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_subtract_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch() - label = create_general_test_tensor_torch() - - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.subtract( - x=input_tensor_1, - y=input_tensor_2, - name="subtract" - ) - return ((input_tensor_1, inp1), (input_tensor_2, inp2)), label, output_tensor - - -def create_tensors_for_tanh_ff(ffmodel): - inp = create_general_test_tensor_torch() - label = 
create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.tanh( - input=input_tensor, - name="tanh" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_transpose_ff(ffmodel): - inp = create_general_test_tensor_torch() - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH), - dtype="float32" - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.transpose( - input=input_tensor, - perm=(0, 2, 1), - name="sin" - ) - return ((input_tensor, inp),), label, output_tensor - - -def create_tensors_for_view_embedding_ff(ffmodel): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ) - - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_INT64) - # Treat `view()` as a special case of `reshape()` - view_tensor = ffmodel.reshape( - input=input_tensor, - shape=FunctionNode.get_view_shape(input_tensor, (-1, inp.shape[-1])), - name="view", - ) - output_tensor = ffmodel.embedding( - input=view_tensor, - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - aggr=AggrMode.AGGR_MODE_NONE, - kernel_initializer=NormInitializer(seed=42, mean=0, stddev=1), - name="embedding", - ) - return ((input_tensor, inp),), label, output_tensor - -def create_tensors_for_max_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch().add(1) - - label = create_general_test_tensor_torch() - - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.max( - x=input_tensor_1, - y=input_tensor_2, - name="max" - ) - - return ((input_tensor_1, inp1),(input_tensor_2, inp2)), label, output_tensor - -def create_tensors_for_min_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - inp2 = create_general_test_tensor_torch().add(1) - - label = create_general_test_tensor_torch() - - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.min( - x=input_tensor_1, - y=input_tensor_2, - name="max" - ) - return ((input_tensor_1, inp1),(input_tensor_2, inp2)), label, output_tensor - -def create_tensors_for_gather_ff(ffmodel): - inp1 = create_general_test_tensor_torch() - index = torch.zeros(BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE, dtype=torch.int64) - - label = create_general_test_tensor_torch() - - input_tensor = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - index_tensor = ffmodel.create_tensor(index.shape, DataType.DT_INT64) - output_tensor = ffmodel.gather( - input=input_tensor, - index=index_tensor, - dim=0, - name="gather" - ) - return ((input_tensor, inp1),(index_tensor, index)), label, output_tensor - - - - -if __name__ == "__main__": - create_single_operator_ff() diff --git a/tests/align/align_create_tensor_torch.py b/tests/align/align_create_tensor_torch.py deleted file mode 100644 index 8b835a5276..0000000000 --- a/tests/align/align_create_tensor_torch.py +++ /dev/null @@ -1,579 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, parse_create_tensor_args, create_general_test_tensor_torch, BATCH_SIZE, INPUT_SIZE, 
SEQ_LENGTH - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -param_weight_op = {'conv2d', 'embedding', 'view_embedding', 'linear'} -param_bias_op = {'conv2d', 'linear'} - - -def create_single_operator_torch(): - args = parse_create_tensor_args() - operator_name = args.operator - OUT_DIR = os.path.join("tests", "align", "out", operator_name) - - if operator_name == 'add': - label, output = create_tensors_for_add_torch() - elif operator_name == 'concat': - label, output = create_tensors_for_concat_torch() - elif operator_name == 'conv2d': - label, output, weight, bias = create_tensors_for_conv2d_torch( - param_dir=OUT_DIR) - elif operator_name == 'cos': - label, output = create_tensors_for_cos_torch() - elif operator_name == 'embedding': - label, output, weight = create_tensors_for_embedding_torch(param_dir=OUT_DIR) - elif operator_name == 'exp': - label, output = create_tensors_for_exp_torch() - elif operator_name == 'flat': - label, output = create_tensors_for_flat_torch() - elif operator_name == 'getitem': - return create_tensors_for_getitem_torch(param_dir=OUT_DIR) - elif operator_name == 'identity': - label, output = create_tensors_for_identity_torch() - elif operator_name == 'layernorm': - label, output = create_tensors_for_layernorm_torch(param_dir=OUT_DIR); - elif operator_name == 'linear': - label, output, weight, bias = create_tensors_for_linear_torch(param_dir=OUT_DIR); - elif operator_name == 'multiply': - label, output = create_tensors_for_multiply_torch() - elif operator_name == 'pool2d': - label, output = create_tensors_for_pool2d_torch() - elif operator_name == 'reducesum': - label, output = create_tensors_for_reducesum_torch() - elif operator_name == 'relu': - label, output = create_tensors_for_relu_torch() - elif operator_name == 'reshape': - label, output = create_tensors_for_reshape_torch() - elif operator_name == 'scalar_add': - label, output = create_tensors_for_scalar_add_torch() - elif operator_name == 'scalar_multiply': - label, output = create_tensors_for_scalar_multiply_torch() - elif operator_name == 'scalar_sub': - label, output = create_tensors_for_scalar_sub_torch() - elif operator_name == 'scalar_truediv': - label, output = create_tensors_for_scalar_truediv_torch() - elif operator_name == 'sigmoid': - label, output = create_tensors_for_sigmoid_torch() - elif operator_name == 'sin': - label, output = create_tensors_for_sin_torch() - elif operator_name == 'subtract': - label, output = create_tensors_for_subtract_torch() - elif operator_name == 'tanh': - label, output = create_tensors_for_tanh_torch() - elif operator_name == 'transpose': - label, output = create_tensors_for_transpose_torch() - elif operator_name == 'view_embedding': - label, output, weight = create_tensors_for_scalar_view_embedding_torch(param_dir=OUT_DIR) - elif operator_name == 'max': - label, output = create_tensors_for_max_torch() - elif operator_name == 'min': - label, output = create_tensors_for_min_torch() - elif operator_name == 'gather': - label, output = create_tensors_for_gather_torch() - else: - raise ValueError('Not Include such Operator in Aligment Test ', operator_name) - - output.retain_grad() - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - - # save tensors to file - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - - if (operator_name in param_weight_op): - torch.save(weight.grad.cpu(), 
os.path.join( - OUT_DIR, "torch_weight_grad.pt")) - if (operator_name in param_bias_op): - torch.save(bias.grad.cpu(), os.path.join( - OUT_DIR, "torch_bias_grad.pt")) - -# run tests for all operators - - -def create_tensors_for_add_torch(): - inp1 = create_general_test_tensor_torch().to(DEVICE) - inp2 = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.add( - input=inp1, - other=inp2 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_concat_torch(): - inp1 = create_general_test_tensor_torch().to(DEVICE) - inp2 = create_general_test_tensor_torch().to(DEVICE) - inp3 = create_general_test_tensor_torch().to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH * 3, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - output = torch.cat( - tensors=(inp1, inp2, inp3), - dim=1 - ).to(DEVICE) - output.requires_grad = True - - return label, output - - -def create_tensors_for_conv2d_torch(param_dir): - KERNEL_SIZE = 3 - IN_CHANNELS = 3 - OUTPUT_SIZE = 510 - OUT_CHANNELS = 5 - conv2d = torch.nn.Conv2d( - in_channels=IN_CHANNELS, - out_channels=OUT_CHANNELS, - kernel_size=KERNEL_SIZE - ).to(DEVICE) - - linear_weight = torch.load(os.path.join(param_dir, "ff_weight.pt")) - linear_bias = torch.load(os.path.join(param_dir, "ff_bias.pt")) - assert conv2d.weight.shape == linear_weight.shape, ( - "Shape mismatch: " f"FF={linear_weight.shape} torch={conv2d.weight.shape}" - ) - assert conv2d.bias.shape == linear_bias.shape, ( - "Shape mismatch: " f"FF={linear_bias.shape} torch={conv2d.bias.shape}" - ) - - conv2d.weight = torch.nn.Parameter(linear_weight.to(DEVICE)) - conv2d.bias = torch.nn.Parameter(linear_bias.to(DEVICE)) - - # generate input/label tensors - # imitating 3-channel image input - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, 3, INPUT_SIZE, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, 5, OUTPUT_SIZE, OUTPUT_SIZE), - dtype="float32" - ).to(DEVICE) - - output = conv2d(inp) - conv2d.zero_grad() - - return label, output, conv2d.weight, conv2d.bias - - -def create_tensors_for_cos_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.cos( - input=inp, - ).to(DEVICE) - output.requires_grad = True - - return label, output - - - -def create_tensors_for_embedding_torch(param_dir): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - embedding = torch.nn.Embedding( - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - device=DEVICE, - ) - embedding_weight = torch.load(os.path.join(param_dir, "ff_weight.pt")) - assert embedding_weight.shape == embedding.weight.shape - embedding.weight = torch.nn.Parameter(embedding_weight.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ).to(DEVICE) - - output = embedding(inp) - embedding.zero_grad() - return label, output, embedding.weight - - -def create_tensors_for_exp_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.exp( - input=inp, - ).to(DEVICE) - output.requires_grad = True - - return label, output - - -def create_tensors_for_flat_torch(): - INPUT_SIZE_2 = 512 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, 
INPUT_SIZE, INPUT_SIZE_2), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH * INPUT_SIZE * INPUT_SIZE_2), - dtype="float32" - ).to(DEVICE) - - # TODO: parameterize start_dim/end_dim - output = torch.flatten(input=inp, start_dim=1, end_dim=3).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_getitem_torch(param_dir): - attention_mask = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="float32", - low=0, - high=2, - ).to(DEVICE) - extended_attention_mask = attention_mask[:, None, None, :] - torch.save(extended_attention_mask.cpu(), os.path.join(param_dir, "torch_out.pt")) - - -def create_tensors_for_identity_torch(): - identity = torch.nn.Identity() - - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - - output = identity(input=inp).to(DEVICE) - identity.zero_grad() - output.requires_grad = True - return label, output - - -def create_tensors_for_layernorm_torch(param_dir): - HIDDEN_SIZE = 512 - EPS = 1e-6 - layernorm = torch.nn.LayerNorm( - normalized_shape=HIDDEN_SIZE, - eps=EPS, - elementwise_affine=True, - ).to(DEVICE) - layernorm_weight = torch.load(os.path.join(param_dir, "ff_weight.pt")) - layernorm_bias = torch.load(os.path.join(param_dir, "ff_bias.pt")) - assert layernorm.weight.shape == layernorm_weight.shape, ( - "Shape mismatch: " f"FF={layernorm_weight.shape} torch={layernorm.weight.shape}" - ) - assert layernorm.bias.shape == layernorm_bias.shape, ( - "Shape mismatch: " f"FF={layernorm_bias.shape} torch={layernorm.bias.shape}" - ) - layernorm.weight = torch.nn.Parameter(layernorm_weight.to(DEVICE)) - layernorm.bias = torch.nn.Parameter(layernorm_bias.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ).to(DEVICE) - - output = layernorm(inp) - layernorm.zero_grad() - return label, output - - -def create_tensors_for_linear_torch(param_dir): - OUTPUT_SIZE = 128 - linear = torch.nn.Linear( - in_features=INPUT_SIZE, - out_features=OUTPUT_SIZE - ).to(DEVICE) - - # get weight/bias from ff files, check same shape - linear_weight = torch.load(os.path.join(param_dir, "ff_weight.pt")) - linear_bias = torch.load(os.path.join(param_dir, "ff_bias.pt")) - assert linear.weight.shape == linear_weight.shape, ( - "Shape mismatch: " f"FF={linear_weight.shape} torch={linear.weight.shape}" - ) - assert linear.bias.shape == linear_bias.shape, ( - "Shape mismatch: " f"FF={linear_bias.shape} torch={linear.bias.shape}" - ) - - # set weight/bias - linear.weight = torch.nn.Parameter(linear_weight.to(DEVICE)) - linear.bias = torch.nn.Parameter(linear_bias.to(DEVICE)) - - # generate input/label tensors w/ gen_tensor - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, OUTPUT_SIZE), - dtype="float32" - ).to(DEVICE) - - # compute the output by running the input through the layer - output = linear(inp) - linear.zero_grad() - - return label, output, linear.weight, linear.bias - - -def create_tensors_for_multiply_torch(): - inp1 = create_general_test_tensor_torch().to(DEVICE) - inp2 = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.mul( - input=inp1, - other=inp2 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_pool2d_torch(): - KERNEL_SIZE
= 3 - IN_CHANNELS = 3 - OUTPUT_SIZE = 510 - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE), - dtype="float32" - ).to(DEVICE) - pool2d = torch.nn.MaxPool2d( - kernel_size=KERNEL_SIZE, stride=1, padding=0).to(DEVICE) - - output = pool2d(inp) - output.requires_grad = True - pool2d.zero_grad() - return label, output - - -def create_tensors_for_reducesum_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - - output = torch.sum( - input=inp, - dim=1, - keepdim=False - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_relu_torch(): - relu = torch.nn.ReLU(inplace=True) - - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - - output = relu(input=inp).to(DEVICE) - relu.zero_grad() - output.requires_grad = True - return label, output - - -def create_tensors_for_reshape_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH), - dtype="float32" - ).to(DEVICE) - output = torch.reshape( - input=inp, - shape=(BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH) - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_scalar_add_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.add( - input=inp, - other=1, - alpha=1 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_scalar_multiply_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.mul( - input=inp, - other=2 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_scalar_sub_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.sub( - input=inp, - other=1 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_scalar_truediv_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.div( - input=inp, - other=2 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_sigmoid_torch(): - sigmoid = torch.nn.Sigmoid() - - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - - output = sigmoid(input=inp).to(DEVICE) - sigmoid.zero_grad() - output.requires_grad = True - return label, output - - -def create_tensors_for_sin_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.sin( - input=inp, - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_subtract_torch(): - inp1 = create_general_test_tensor_torch().to(DEVICE) - inp2 = create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.sub( - input=inp1, - other=inp2 - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_tanh_torch(): - inp = 
create_general_test_tensor_torch().to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.tanh( - input=inp, - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_transpose_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH), - dtype="float32" - ).to(DEVICE) - output = torch.transpose( - input=inp, - dim0=1, - dim1=2, - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_view_embedding_torch(param_dir): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - embedding = torch.nn.Embedding( - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - device=DEVICE, - ) - embedding_weight = torch.load(os.path.join(param_dir, "ff_weight.pt")) - assert embedding_weight.shape == embedding.weight.shape, \ - "Shape mismatch: " \ - f"FF={embedding_weight.shape} torch={embedding.weight.shape}" - embedding.weight = torch.nn.Parameter(embedding_weight.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ).to(DEVICE) - - output = embedding(inp.view(-1, inp.shape[-1])) - embedding.zero_grad() - return label, output, embedding.weight - - -def create_tensors_for_max_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - oth = create_general_test_tensor_torch().add(1).to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.maximum( - input=inp, - other=oth - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_min_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - oth = create_general_test_tensor_torch().add(1).to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.minimum( - input=inp, - other=oth - ).to(DEVICE) - output.requires_grad = True - return label, output - - -def create_tensors_for_gather_torch(): - inp = create_general_test_tensor_torch().to(DEVICE) - index = torch.zeros(BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE, dtype=torch.int64).to(DEVICE) - label = create_general_test_tensor_torch().to(DEVICE) - output = torch.gather( - input=inp, - index=index, - dim=0 - ).to(DEVICE) - output.requires_grad = True - return label, output - -if __name__ == "__main__": - create_single_operator_torch() diff --git a/tests/align/align_ff_utils.py b/tests/align/align_ff_utils.py deleted file mode 100644 index fb547e3456..0000000000 --- a/tests/align/align_ff_utils.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import sys -from typing import Iterable, Tuple - -import numpy as np -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import (FFConfig, FFModel, Parameter, - SingleDataLoader, Tensor) -from flexflow.type import ParameterSyncType - - -def ffmodel_barrier(ffmodel): - # Use `get_current_time()` as a forced synchronization barrier - ffmodel._ffconfig.get_current_time() - - -def compile_ffmodel(ffmodel: FFModel): - """Compiles the FlexFlow model ``ffmodel`` using MSE loss.""" - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) # unused - ffmodel.compile( - optimizer=ffoptimizer, - loss_type=LossType.LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, - metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], - ) - - -def init_ffmodel( - ffmodel: FFModel, - input_tensors: Iterable[Tuple[Tensor, torch.Tensor]], - label:
torch.Tensor, -) -> Tuple[SingleDataLoader, ...]: - """Initializes the FFModel by creating the data loaders and initializing - the model layers.""" - dls = [] - for input_tensor, inp in input_tensors: - dls.append( - ffmodel.create_data_loader(input_tensor, inp.numpy()) - ) - dls.append( - ffmodel.create_data_loader(ffmodel.label_tensor, label.numpy()) - ) - ffmodel.init_layers() - return tuple(dls) - - -def run_fwd_bwd( - ffmodel: FFModel, - ffconfig: FFConfig, - input_dls: Iterable[SingleDataLoader], - label_dl: SingleDataLoader, - run_bwd: bool = True, -) -> None: - """Runs a single forward pass and backward pass.""" - batch_size = ffconfig.batch_size - dataloaders = list(input_dls) + [label_dl] - num_samples = label_dl.num_samples - ffmodel._tracing_id += 1 - for d in dataloaders: - d.reset() - ffmodel.reset_metrics() - num_iters = num_samples // batch_size - assert num_iters == 1, "Internal error: batch size mismatch" - for d in dataloaders: - d.next_batch(ffmodel) - ffmodel._ffconfig.begin_trace(ffmodel._tracing_id) - ffmodel.forward() - if run_bwd: - ffmodel.zero_gradients() - ffmodel.backward() - ffmodel._ffconfig.end_trace(ffmodel._tracing_id) - ffmodel_barrier(ffmodel) - - -def ensure_dir_exists(filepath: str): - """Ensures the directory containing ``filepath`` exists.""" - if not os.path.exists(os.path.dirname(filepath)): - os.makedirs(os.path.dirname(filepath)) - - -def save_tensor_ff(tensor_ff: Tensor, ffmodel: FFModel, filepath: str) -> None: - """Saves the FlexFlow tensor ``tensor_ff`` to the filepath ``filepath``.""" - tensor_np: np.ndarray = tensor_ff.get_model_output_tensor(ffmodel) - tensor_torch: torch.Tensor = torch.from_numpy(tensor_np) - ensure_dir_exists(filepath) - torch.save(tensor_torch, filepath) - - -def save_tensor_grad_ff(tensor_ff: Tensor, ffmodel: FFModel, filepath: str) -> None: - """Saves the gradient of the FlexFlow tensor ``tensor_ff`` to the filepath - ``filepath``.""" - grad_np: np.ndarray = tensor_ff.get_model_output_gradients(ffmodel, ParameterSyncType.PS) - grad_torch: torch.Tensor = torch.from_numpy(grad_np) - ensure_dir_exists(filepath) - torch.save(grad_torch, filepath) - - -def save_param_ff(param_ff: Parameter, ffmodel: FFModel, filepath: str) -> None: - """Saves the FlexFlow parameter ``param_ff`` to the filepath - ``filepath``.""" - param_np: np.ndarray = param_ff.get_weights(ffmodel) - param_torch: torch.Tensor = torch.from_numpy(param_np) - ensure_dir_exists(filepath) - torch.save(param_torch, filepath) - - -def save_param_grad_ff(param_ff: Parameter, ffmodel: FFModel, filepath: str) -> None: - """Saves the gradient of the FlexFlow parameter ``param_ff`` to the - filepath ``filepath``.""" - grad_np: np.ndarray = param_ff.get_gradients(ffmodel, ParameterSyncType.PS) - grad_torch: torch.Tensor = torch.from_numpy(grad_np) - ensure_dir_exists(filepath) - torch.save(grad_torch, filepath) diff --git a/tests/align/align_test.py b/tests/align/align_test.py deleted file mode 100644 index da7c08d962..0000000000 --- a/tests/align/align_test.py +++ /dev/null @@ -1,304 +0,0 @@ -import os -import sys -from typing import Callable - -sys.path.append("./align/") - -from align_utils import TensorAlignmentData, align_tensors - -BASE_DIR = "align/" - - -def prepend_dirname_fn(dirname: str) -> Callable[[str], str]: - def f(filename): - return os.path.join(dirname, filename) - return f - - -def test_embedding(): - out_dir = os.path.join(BASE_DIR, "embedding", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - 
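(Aside: the ``run_bwd`` flag above is what lets forward-only checks reuse this driver unchanged; a minimal usage sketch, mirroring how the getitem script further down calls it, with the ``ffmodel``/dataloader names assumed to be set up as in those scripts:)

```python
# Forward-only variant: a test with no gradients to compare (e.g. getitem,
# which only slices its input) skips zero_gradients()/backward() entirely.
run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl, run_bwd=False)
```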
"embedding_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - "embedding_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - TensorAlignmentData( - "embedding_weight_grad", - expand("ff_weight_grad.pt"), - expand("torch_weight_grad.pt"), - ), - ] - ) - - -def test_layernorm(): - out_dir = os.path.join(BASE_DIR, "layernorm", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - "layernorm_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - "layernorm_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - TensorAlignmentData( - "layernorm_weight_grad", - expand("ff_weight_grad.pt"), - expand("torch_weight_grad.pt"), - ), - TensorAlignmentData( - "layernorm_bias_grad", - expand("ff_bias_grad.pt"), - expand("torch_bias_grad.pt") - ) - ] - ) - - -def test_view_embedding(): - out_dir = os.path.join(BASE_DIR, "view_embedding", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - "embedding_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - "embedding_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - TensorAlignmentData( - "embedding_weight_grad", - expand("ff_weight_grad.pt"), - expand("torch_weight_grad.pt"), - ), - ] - ) - - -def test_getitem(): - out_dir = os.path.join(BASE_DIR, "getitem", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - "getitem_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - ] - ) - -def test_linear(): - out_dir = os.path.join(BASE_DIR, "linear", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - "linear_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - "linear_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - TensorAlignmentData( - "linear_weight_grad", - expand("ff_weight_grad.pt"), - expand("torch_weight_grad.pt"), - ), - TensorAlignmentData( - "linear_bias_grad", - expand("ff_bias_grad.pt"), - expand("torch_bias_grad.pt") - ) - ] - ) - -def test_conv2d(): - out_dir = os.path.join(BASE_DIR, "conv2d", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - "conv2d_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - "conv2d_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - TensorAlignmentData( - "conv2d_weight_grad", - expand("ff_weight_grad.pt"), - expand("torch_weight_grad.pt"), - ), - TensorAlignmentData( - "conv2d_bias_grad", - expand("ff_bias_grad.pt"), - expand("torch_bias_grad.pt") - ) - ] - ) - - -def test_add(): - out_dir = os.path.join(BASE_DIR, "add", "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - "add_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - "add_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - ] - ) - -def test_subtract(): - _test_operator('subtract') - - -def test_multiply(): - _test_operator('multiply') - - -def test_pool2d(): - _test_operator('pool2d') - - -def test_reducesum(): - _test_operator('reducesum') - - -def test_reshape(): - _test_operator('reshape') - - -def test_flat(): - _test_operator('flat') - - -def test_sin(): - _test_operator('sin') - - -def test_transpose(): - _test_operator('transpose') - - -def test_exp(): - 
_test_operator('exp') - - -def test_cos(): - _test_operator('cos') - - -def test_scalar_add(): - _test_operator('scalar_add') - - -def test_scalar_sub(): - _test_operator('scalar_sub') - - -def test_scalar_multiply(): - _test_operator('scalar_multiply') - - -def test_scalar_truediv(): - _test_operator('scalar_truediv') - - -def test_relu(): - _test_operator('relu') - - -def test_sigmoid(): - _test_operator('sigmoid') - - -def test_tanh(): - _test_operator('tanh') - - -def test_identity(): - _test_operator('identity') - - -# def test_max(): -# _test_operator('max') - - -# def test_min(): -# _test_operator('min') - -def test_gather(): - _test_operator('gather') - - -def _test_operator(operator_name): - out_dir = os.path.join(BASE_DIR, operator_name, "out") - expand = prepend_dirname_fn(out_dir) - align_tensors( - [ - TensorAlignmentData( - f"{operator_name}_out", - expand("ff_out.pt"), - expand("torch_out.pt"), - ), - TensorAlignmentData( - f"{operator_name}_out_grad", - expand("ff_out_grad.pt"), - expand("torch_out_grad.pt"), - ), - ] - ) \ No newline at end of file diff --git a/tests/align/align_utils.py b/tests/align/align_utils.py deleted file mode 100644 index 34f07a4928..0000000000 --- a/tests/align/align_utils.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import Any, Dict, Tuple - -import numpy as np -import torch -import os - -from typing import Iterable, NamedTuple -from argparse import ArgumentParser - -BATCH_SIZE = 16 -INPUT_SIZE = 512 -SEQ_LENGTH = 5 - -def make_deterministic(seed: int = 42) -> None: - """Makes ensuing runs deterministic by setting seeds and using deterministic - backends.""" - torch.manual_seed(seed) - np.random.seed(seed) - torch.backends.cudnn.deterministic = True - - -def str_dtype_to_torch_dtype(dtype: str) -> torch.dtype: - """Converts a string representation of a dtype to the corresponding - PyTorch dtype.""" - if dtype == "int32": - return torch.int32 - elif dtype == "int64": - return torch.int64 - elif dtype == "float32": - return torch.float32 - elif dtype == "float64": - return torch.float64 - else: - raise ValueError(f"Unsupported dtype: {dtype}") - - -def gen_tensor( - shape: Tuple[int, ...], - dtype: str = "float32", - **kwargs: Dict[str, Any], -) -> torch.Tensor: - """ - Generates a random tensor on host with the given shape. If ``dtype`` names - an integer type, then the tensor elements are chosen uniformly at random from - the range [-5, 5] by default or a user-specified range (taken from - ``kwargs``). If ``dtype`` names a floating-point type, then the tensor - elements are chosen from the standard normal distribution. - - Arguments: - shape (Tuple[int, ...]): Shape of the tensor to generate. - dtype (str): Data type of the tensor to - generate. (Default: "float32") - kwargs (Dict[str, Any]): Keyword arguments to forward to the - constructor---namely, if ``dtype`` names an integer type, then - ``kwargs`` can specify ``low`` and ``high`` to define the range - [``low``, ``high``) from which the random tensor elements are - generated.
- """ - make_deterministic() - is_integer_dtype = str(dtype).find("int") >= 0 - if is_integer_dtype: - low = kwargs.get("low", -5) - high = kwargs.get("high", 6) - assert high > low, f"Invalid range: [{low}, {high})" - np_array = np.random.randint( - low=low, - high=high, - size=shape, - dtype=dtype, - ) - else: - np_array = np.random.randn(*shape) - return torch.from_numpy(np_array).to(str_dtype_to_torch_dtype(dtype)) - - -class TensorAlignmentData(NamedTuple): - """ - This contains the data for aligning FlexFlow and PyTorch on a tensor - quantity. It includes a pair of filepaths (``ff_filepath`` and - ``torch_filepath``) to PyTorch tensors (saved as ``.pt`` files) - representing the FlexFlow and PyTorch versions of the tensor quantity - given by ``tensor_name``. - """ - tensor_name: str - ff_filepath: str - torch_filepath: str - - -def align_tensors(tensor_alignment_data_iter: Iterable[TensorAlignmentData]): - """ - Checks the alignment between tensors specified by - ``tensor_alignment_data_iter``. Each element in the iterable specifies a - single tensor quantity to align between FlexFlow and PyTorch. - """ - for tensor_alignment_data in tensor_alignment_data_iter: - ff_filepath = tensor_alignment_data.ff_filepath - torch_filepath = tensor_alignment_data.torch_filepath - assert os.path.exists(ff_filepath), \ - f"Missing FlexFlow tensor at {ff_filepath}" - assert os.path.exists(torch_filepath), \ - f"Missing PyTorch tensor at {torch_filepath}" - ff_tensor = torch.load(ff_filepath).cpu() - torch_tensor = torch.load(torch_filepath).cpu() - print(f"Checking {tensor_alignment_data.tensor_name} alignment...") - torch.testing.assert_close(ff_tensor, torch_tensor) - - -def parse_create_tensor_args(): - """ - get operator name from command line for creating tensors - """ - parser = ArgumentParser(description='Pytorch Aligment Test Suite') - parser.add_argument("-o", "--operator", dest="operator", - required=False, metavar="", help="operator needs to be test") - - args, unknown = parser.parse_known_args() - return args - -def create_general_test_tensor_torch() -> torch.Tensor: - """ - generate general input size of alignment tests - """ - tensor: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - return tensor \ No newline at end of file diff --git a/tests/align/conv2d/align_conv2d_ff.py b/tests/align/conv2d/align_conv2d_ff.py deleted file mode 100644 index 5ab92070f9..0000000000 --- a/tests/align/conv2d/align_conv2d_ff.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import Linear, Op, Parameter -from flexflow.type import AggrMode - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import BATCH_SIZE, gen_tensor - -OUT_DIR = os.path.join("align", "conv2d", "out") - - -def run(): - KERNEL_SIZE = 3 - INPUT_SIZE = 512 - IN_CHANNELS = 3 - OUTPUT_SIZE = 510 - OUT_CHANNELS = 5 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, OUT_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE), - dtype="float32" - ) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.conv2d( - input=input_tensor, - out_channels=OUT_CHANNELS, - kernel_h=KERNEL_SIZE, - 
kernel_w=KERNEL_SIZE, - stride_h=1, - stride_w=1, - padding_h=0, - padding_w=0, - name="conv2d" - ) - - # compile model - compile_ffmodel(ffmodel) - dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label) - assert len(dls) == 2 - inp_dl, label_dl = dls - - # forward/back pass - run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl) - - conv2d_layer: Op = ffmodel.get_layers()[0] - assert isinstance(conv2d_layer, Conv2D) - conv2d_weight: Parameter = conv2d_layer.get_weight_tensor() - conv2d_bias: Parameter = conv2d_layer.get_bias_tensor() - - # save output data - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - - # save layer data - save_param_ff(conv2d_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt")) - save_param_ff(conv2d_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt")) - save_param_grad_ff(conv2d_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt")) - save_param_grad_ff(conv2d_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias_grad.pt")) - -if __name__ == "__main__": - run() diff --git a/tests/align/conv2d/align_conv2d_torch.py b/tests/align/conv2d/align_conv2d_torch.py deleted file mode 100644 index 7ba296a7d3..0000000000 --- a/tests/align/conv2d/align_conv2d_torch.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -OUT_DIR = os.path.join("align", "conv2d", "out") - -def run(): - KERNEL_SIZE = 3 - INPUT_SIZE = 512 - IN_CHANNELS = 3 - OUTPUT_SIZE = 510  # INPUT_SIZE - KERNEL_SIZE + 1 (stride 1, no padding) - OUT_CHANNELS = 5 - conv2d = torch.nn.Conv2d( - in_channels=IN_CHANNELS, - out_channels=OUT_CHANNELS, - kernel_size=KERNEL_SIZE - ).to(DEVICE) - - conv2d_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt")) - conv2d_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt")) - assert conv2d.weight.shape == conv2d_weight.shape, ( - "Shape mismatch: " f"FF={conv2d_weight.shape} torch={conv2d.weight.shape}" - ) - assert conv2d.bias.shape == conv2d_bias.shape, ( - "Shape mismatch: " f"FF={conv2d_bias.shape} torch={conv2d.bias.shape}" - ) - - conv2d.weight = torch.nn.Parameter(conv2d_weight.to(DEVICE)) - conv2d.bias = torch.nn.Parameter(conv2d_bias.to(DEVICE)) - - # generate input/label tensors - # imitating 3-channel image input - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, OUT_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE), - dtype="float32" - ).to(DEVICE) - - output = conv2d(inp) - conv2d.zero_grad() - output.retain_grad() - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - torch.save(conv2d.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt")) - torch.save(conv2d.bias.grad.cpu(), os.path.join(OUT_DIR, "torch_bias_grad.pt")) - - -if __name__ == "__main__": - run() \ No newline at end of file diff --git a/tests/align/conv2d/gen_tensors.sh b/tests/align/conv2d/gen_tensors.sh deleted file mode 100755 index f7582c1057..0000000000 --- a/tests/align/conv2d/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!
/usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/conv2d/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/conv2d/align_conv2d_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/conv2d/align_conv2d_torch.py; - diff --git a/tests/align/embedding/align_embedding_ff.py b/tests/align/embedding/align_embedding_ff.py deleted file mode 100644 index 14db9bec1a..0000000000 --- a/tests/align/embedding/align_embedding_ff.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import Embedding, Op, Parameter -from flexflow.type import AggrMode - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import BATCH_SIZE, gen_tensor - -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "embedding", "out") - - -def run(): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_INT64) - output_tensor = ffmodel.embedding( - input=input_tensor, - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - aggr=AggrMode.AGGR_MODE_NONE, - kernel_initializer=NormInitializer(seed=42, mean=0, stddev=1), - name="embedding", - ) - compile_ffmodel(ffmodel) - dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label) - assert len(dls) == 2 - inp_dl, label_dl = dls - run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl) - - embedding_layer: Op = ffmodel.get_layers()[0] - assert isinstance(embedding_layer, Embedding) - embedding_weight: Parameter = embedding_layer.get_weight_tensor() - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - save_param_ff(embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt")) - save_param_grad_ff( - embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt") - ) - - -if __name__ == "__main__": - run() diff --git a/tests/align/embedding/align_embedding_torch.py b/tests/align/embedding/align_embedding_torch.py deleted file mode 100644 index b265088713..0000000000 --- a/tests/align/embedding/align_embedding_torch.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "embedding", "out") - - -def run(): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - embedding = torch.nn.Embedding( - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - device=DEVICE, - ) - embedding_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt")) - assert embedding_weight.shape == embedding.weight.shape - embedding.weight = torch.nn.Parameter(embedding_weight.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ).to(DEVICE) - - output = 
embedding(inp) - embedding.zero_grad() - output.retain_grad() - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - torch.save( - embedding.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt") - ) - - -if __name__ == "__main__": - run() diff --git a/tests/align/embedding/gen_tensors.sh b/tests/align/embedding/gen_tensors.sh deleted file mode 100755 index 716474f302..0000000000 --- a/tests/align/embedding/gen_tensors.sh +++ /dev/null @@ -1,8 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/embedding/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/embedding/align_embedding_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/embedding/align_embedding_torch.py; diff --git a/tests/align/getitem/align_getitem_ff.py b/tests/align/getitem/align_getitem_ff.py deleted file mode 100644 index dbb5f54df3..0000000000 --- a/tests/align/getitem/align_getitem_ff.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.torch.model import GetItemNode - -sys.path.append("./align/") -from align_ff_utils import compile_ffmodel, init_ffmodel, run_fwd_bwd, save_tensor_ff -from align_utils import gen_tensor, BATCH_SIZE - -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "getitem", "out") - - -def run(): - """Checks the ``getitem()`` code path for tensor slicing.""" - attention_mask = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=2, - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="float32", - ) # unused - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - attention_mask_tensor = ffmodel.create_tensor( - attention_mask.shape, - DataType.DT_INT64, - ) - extended_attention_mask = GetItemNode.slice_tensor( - ffmodel, - attention_mask_tensor, - (slice(None, None, None), None, None, slice(None, None, None)), - "slice", - ) - - compile_ffmodel(ffmodel) - dls = init_ffmodel( - ffmodel, ((attention_mask_tensor, attention_mask),), label, - ) - assert len(dls) == 2 - inp_dl, label_dl = dls - run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl, run_bwd=False) - - save_tensor_ff(extended_attention_mask, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - - -if __name__ == "__main__": - run() diff --git a/tests/align/getitem/align_getitem_torch.py b/tests/align/getitem/align_getitem_torch.py deleted file mode 100644 index 283d87d754..0000000000 --- a/tests/align/getitem/align_getitem_torch.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "getitem", "out") - - -def run(): - """Checks the ``getitem()`` code path for tensor slicing.""" - attention_mask = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=2, - ).to(DEVICE) - # Extend to shape (BATCH_SIZE, 1, 1, SEQ_LENGTH) - extended_attention_mask = attention_mask[:, None, None, :] - torch.save(extended_attention_mask.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - - -if __name__ == "__main__": - run() diff --git a/tests/align/getitem/gen_tensors.sh b/tests/align/getitem/gen_tensors.sh deleted file mode 100755 index 
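(The slicing under test in the two getitem files above is plain ``None``-indexing; a tiny standalone check of the shape transformation, independent of those files:)

```python
import torch

# attention_mask[:, None, None, :] inserts two singleton dimensions:
# (BATCH_SIZE, SEQ_LENGTH) -> (BATCH_SIZE, 1, 1, SEQ_LENGTH),
# the broadcastable shape HuggingFace uses for attention masks.
mask = torch.ones(16, 5)
extended = mask[:, None, None, :]
assert extended.shape == (16, 1, 1, 5)

# Equivalent formulation with unsqueeze:
assert mask.unsqueeze(1).unsqueeze(2).shape == (16, 1, 1, 5)
```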
0f12a0d76e..0000000000 --- a/tests/align/getitem/gen_tensors.sh +++ /dev/null @@ -1,8 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/getitem/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/getitem/align_getitem_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/getitem/align_getitem_torch.py; diff --git a/tests/align/layernorm/align_layernorm_ff.py b/tests/align/layernorm/align_layernorm_ff.py deleted file mode 100644 index 534a0a6225..0000000000 --- a/tests/align/layernorm/align_layernorm_ff.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import LayerNorm, Op, Parameter - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import gen_tensor, BATCH_SIZE - -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "layernorm", "out") - - -def run(): - HIDDEN_SIZE = 512 - EPS = 1e-6 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.layer_norm( - input=input_tensor, - axes=[len(input_tensor.dims) - 1], # normalize over the last dimension - elementwise_affine=True, - eps=EPS, - name="layernorm", - ) - - compile_ffmodel(ffmodel) - dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label) - assert len(dls) == 2 - inp_dl, label_dl = dls - run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl) - - layernorm_layer: Op = ffmodel.get_layers()[0] - assert isinstance(layernorm_layer, LayerNorm) - layernorm_weight: Parameter = layernorm_layer.get_weight_tensor() - layernorm_bias: Parameter = layernorm_layer.get_bias_tensor() - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - save_param_ff(layernorm_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt")) - save_param_ff(layernorm_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt")) - save_param_grad_ff(layernorm_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt")) - save_param_grad_ff(layernorm_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias_grad.pt")) - - -if __name__ == "__main__": - run() diff --git a/tests/align/layernorm/align_layernorm_torch.py b/tests/align/layernorm/align_layernorm_torch.py deleted file mode 100644 index b49a8aca74..0000000000 --- a/tests/align/layernorm/align_layernorm_torch.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "layernorm", "out") - - -def run(): - HIDDEN_SIZE = 512 - EPS = 1e-6 - layernorm = torch.nn.LayerNorm( - normalized_shape=HIDDEN_SIZE, - eps=EPS, - elementwise_affine=True, - ).to(DEVICE) - layernorm_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt")) - layernorm_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt")) - assert layernorm.weight.shape == layernorm_weight.shape, ( - "Shape mismatch: " f"FF={layernorm_weight.shape} 
torch={layernorm.weight.shape}" - ) - assert layernorm.bias.shape == layernorm_bias.shape, ( - "Shape mismatch: " f"FF={layernorm_bias.shape} torch={layernorm.bias.shape}" - ) - layernorm.weight = torch.nn.Parameter(layernorm_weight.to(DEVICE)) - layernorm.bias = torch.nn.Parameter(layernorm_bias.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ).to(DEVICE) - - output = layernorm(inp) - layernorm.zero_grad() - output.retain_grad() - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - torch.save(layernorm.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt")) - torch.save(layernorm.bias.grad.cpu(), os.path.join(OUT_DIR, "torch_bias_grad.pt")) - - -if __name__ == "__main__": - run() diff --git a/tests/align/layernorm/align_t5_layernorm_torch.py b/tests/align/layernorm/align_t5_layernorm_torch.py deleted file mode 100644 index b8ed6ad9d8..0000000000 --- a/tests/align/layernorm/align_t5_layernorm_torch.py +++ /dev/null @@ -1,70 +0,0 @@ -import argparse -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -BATCH_SIZE = 16 -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "layernorm", "out") - - -class T5LayerNorm(torch.nn.Module): - """See https://github.com/huggingface/transformers/blob/master/src/transformers/models/t5/modeling_t5.py""" - - def __init__(self, hidden_size, eps=1e-6): - """Construct a layernorm module in the T5 style (no bias and no - subtraction of mean).""" - super().__init__() - self.weight = torch.nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - return self.weight * hidden_states - - -def run(): - # Initialize the T5 layer norm and load the weight from FlexFlow - HIDDEN_SIZE = 512 - t5_layernorm = T5LayerNorm(HIDDEN_SIZE).to(DEVICE) - t5_layernorm_weight = torch.load(os.path.join(OUT_DIR, "ff_layernorm_weight.pt")) - assert t5_layernorm.weight.shape == t5_layernorm_weight.shape, ( - "Shape mismatch: " - f"FF={t5_layernorm_weight.shape} torch={t5_layernorm.weight.shape}" - ) - t5_layernorm.weight = torch.nn.Parameter(t5_layernorm_weight.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE), - dtype="float32", - ).to(DEVICE) - - output = t5_layernorm(inp) - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - - t5_layernorm.zero_grad() - output.retain_grad() - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save( - t5_layernorm.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt") - ) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - - -if __name__ == "__main__": - run() diff --git 
a/tests/align/layernorm/gen_tensors.sh b/tests/align/layernorm/gen_tensors.sh deleted file mode 100755 index 79ede58e64..0000000000 --- a/tests/align/layernorm/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/layernorm/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/layernorm/align_layernorm_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/layernorm/align_layernorm_torch.py; - diff --git a/tests/align/linear/align_linear_ff.py b/tests/align/linear/align_linear_ff.py deleted file mode 100644 index e3790ddc43..0000000000 --- a/tests/align/linear/align_linear_ff.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import Linear, Op, Parameter -from flexflow.type import AggrMode - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import BATCH_SIZE, gen_tensor - -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "linear", "out") - - -def run(): - # create input, label tensors - INPUT_SIZE = 512 - OUTPUT_SIZE = 128 - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE), - dtype="float32" - ) - - # initialize ffmodel object - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.dense( - input=input_tensor, - out_dim=128, - name="linear" - ) - - - # compile model - compile_ffmodel(ffmodel) - - # fails here - dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label) - assert len(dls) == 2 - inp_dl, label_dl = dls - - # forward/back pass - run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl) - - # get linear layer - linear_layer: Op = ffmodel.get_layers()[0] - assert isinstance(linear_layer, Linear) - linear_weight: Parameter = linear_layer.get_weight_tensor() - linear_bias: Parameter = linear_layer.get_bias_tensor() - - # save output data - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - - # save layer data - save_param_ff(linear_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt")) - save_param_ff(linear_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt")) - save_param_grad_ff(linear_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt")) - save_param_grad_ff(linear_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias_grad.pt")) - -if __name__ == "__main__": - run() diff --git a/tests/align/linear/align_linear_torch.py b/tests/align/linear/align_linear_torch.py deleted file mode 100644 index 4cf65ec2a9..0000000000 --- a/tests/align/linear/align_linear_torch.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "linear", "out") - -def run(): - # define layer in pytorch - INPUT_SIZE = 512 - OUTPUT_SIZE = 128 - linear = torch.nn.Linear( - in_features=512, - out_features=128 - ).to(DEVICE) - - # get weight/bias from ff files, check same shape - linear_weight = torch.load(os.path.join(OUT_DIR, 
"ff_weight.pt")) - linear_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt")) - assert linear.weight.shape == linear_weight.shape, ( - "Shape mismatch: " f"FF={linear_weight.shape} torch={linear.weight.shape}" - ) - assert linear.bias.shape == linear_bias.shape, ( - "Shape mismatch: " f"FF={linear_bias.shape} torch={linear.bias.shape}" - ) - - # set weight/bias - linear.weight = torch.nn.Parameter(linear_weight.to(DEVICE)) - linear.bias = torch.nn.Parameter(linear_bias.to(DEVICE)) - - # generate input/label tensors w/ gen_tensor - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE), - dtype="float32" - ).to(DEVICE) - - # get output running input through layer - output = linear(inp) - linear.zero_grad() - output.retain_grad() - - # loss function - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - - # backpropogate - loss.backward() - - # save out, out grad, layer weight & bias gradients - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - torch.save(linear.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt")) - torch.save(linear.bias.grad.cpu(), os.path.join(OUT_DIR, "torch_bias_grad.pt")) - - -if __name__ == "__main__": - run() \ No newline at end of file diff --git a/tests/align/linear/gen_tensors.sh b/tests/align/linear/gen_tensors.sh deleted file mode 100755 index 8e29825912..0000000000 --- a/tests/align/linear/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/linear/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/linear/align_linear_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/linear/align_linear_torch.py; - diff --git a/tests/align/mt5_encoder/align_mt5_encoder_ff.py b/tests/align/mt5_encoder/align_mt5_encoder_ff.py deleted file mode 100644 index 7ea4156244..0000000000 --- a/tests/align/mt5_encoder/align_mt5_encoder_ff.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import sys - -from flexflow.core import * - -sys.path.append("./align/") -from align_ff_utils import run_fwd_bwd -from mt5_ff_utils import init_ff_mt5_encoder - -# NOTE: We use the PyTorch mT5 encoder output as the labels -ENCODER_LABELS_PATH = os.path.join( - "align", "mt5_encoder", "out", "hidden_states.pt", -) - - -def run(): - assert os.path.exists(ENCODER_LABELS_PATH), \ - "Make sure to generate the encoder labels file (e.g. 
by modifying " \ - "the transformers library source code)" - ffmodel, input_dls, label_dl = init_ff_mt5_encoder( - ENCODER_LABELS_PATH, - ) - run_fwd_bwd(ffmodel, ffmodel._ffconfig, input_dls, label_dl) - - -if __name__ == "__main__": - run() diff --git a/tests/align/mt5_ff_utils.py b/tests/align/mt5_ff_utils.py deleted file mode 100644 index 5c343bb533..0000000000 --- a/tests/align/mt5_ff_utils.py +++ /dev/null @@ -1,196 +0,0 @@ -import os -from collections import OrderedDict -from typing import Optional - -import numpy as np -import torch -from flexflow.core import * -from flexflow.torch.model import PyTorchModel, InputNode, OutputNode -from transformers import MT5ForConditionalGeneration - -PRETRAINED_MODEL_NAME = "google/mt5-small" - -BASE_DIR = "examples/python/pytorch/mt5" -DATA_DIR = os.path.join(BASE_DIR, "data") -BATCH_DIR = os.path.join(DATA_DIR, "batch") -INPUT_IDS_PATH = os.path.join(BATCH_DIR, "ids.pt") -ATTENTION_MASK_PATH = os.path.join(BATCH_DIR, "mask.pt") -DECODER_INPUT_IDS_PATH = os.path.join(BATCH_DIR, "y_ids.pt") -LABELS_PATH = os.path.join(BATCH_DIR, "lm_labels.pt") - - -def load_batch_ff(): - """Loads a single batch for mT5, consisting of the encoder input IDs, - encoder attention mask, decoder input IDs, and labels, all as numpy - arrays.""" - input_ids = torch.load(INPUT_IDS_PATH).numpy() - attention_mask = torch.load(ATTENTION_MASK_PATH).numpy() - decoder_input_ids = torch.load(DECODER_INPUT_IDS_PATH).numpy() - labels = torch.load(LABELS_PATH).numpy() - return (input_ids, attention_mask, decoder_input_ids, labels) - - -def init_ff_mt5(): - """ - Initializes the FlexFlow representation of the HuggingFace mT5 model. - - Returns: - (ffmodel, input_dls, label_dl) - - ffmodel (FFModel): Compiled and initialized FlexFlow model representing - HuggingFace mT5. - input_dls (List[SingleDataLoader]): List consisting of the encoder - input IDs, encoder attention mask, and decoder input IDs - dataloaders. - label_dl (SingleDataLoader): Label dataloader.
- """ - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - mt5_torch = MT5ForConditionalGeneration.from_pretrained( - PRETRAINED_MODEL_NAME, - ) - input_ids, attention_mask, decoder_input_ids, labels = load_batch_ff() - input_tensors = [ - ffmodel.create_tensor(input_ids.shape, DataType.DT_INT64), - ffmodel.create_tensor(attention_mask.shape, DataType.DT_INT64), - ffmodel.create_tensor(decoder_input_ids.shape, DataType.DT_INT64), - ] - mt5_model = PyTorchModel( - mt5_torch, - is_hf_model=True, - input_names=["input_ids", "attention_mask", "decoder_input_ids"], - batch_size=ffconfig.batch_size, - seq_length=(input_ids.shape[1], decoder_input_ids.shape[1]), - ) - output_tensors = mt5_model.torch_to_ff(ffmodel, input_tensors) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) - ffmodel.compile( - optimizer=ffoptimizer, - loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - metrics=[ - MetricsType.METRICS_ACCURACY, - MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - ], - ) - input_ids_dl = ffmodel.create_data_loader(input_tensors[0], input_ids) - attention_mask_dl = ffmodel.create_data_loader( - input_tensors[1], attention_mask, - ) - decoder_input_ids_dl = ffmodel.create_data_loader( - input_tensors[2], decoder_input_ids, - ) - # NOTE: We cast down the label tensor data to 32-bit to accomomodate the - # label tensor's bitwidth requirement - label_dl = ffmodel.create_data_loader( - ffmodel.label_tensor, labels.astype("int32"), - ) - input_dls = [input_ids_dl, attention_mask_dl, decoder_input_ids_dl] - ffmodel.init_layers() - return (ffmodel, input_dls, label_dl) - - -def extract_mt5_subgraph( - initial_op_name: Optional[str] = None, - final_op_name: Optional[str] = None, -): - """ - Extracts the mT5 subgraph starting from ``initial_op_name`` and ending - with ``final_op_name`` (inclusive) in the topological order. If either - argument is ``None``, then that side of the limit defaults to the first - and last operator, respectively. - - NOTE: HuggingFace's symbolic trace only supports tracing a selection of - classes. As a result, we must extract subgraphs from the full mT5 graph - in the Python FlexFlow space. - - Returns: - subgraph (List[Node]): List of the nodes comprising the subgraph. - """ - mt5_torch = MT5ForConditionalGeneration.from_pretrained( - PRETRAINED_MODEL_NAME, - ) - input_ids, _, decoder_input_ids, _ = load_batch_ff() - BATCH_SIZE = 8 - mt5_model = PyTorchModel( - mt5_torch, - is_hf_model=True, - input_names=["input_ids", "attention_mask", "decoder_input_ids"], - batch_size=BATCH_SIZE, - seq_length=(input_ids.shape[1], decoder_input_ids.shape[1]), - ) - graph = mt5_model._trace_model() - subgraph = [] - in_subgraph: bool = initial_op_name is None - for node in graph: - if initial_op_name is not None and node.name == initial_op_name: - in_subgraph = True - if in_subgraph: - subgraph.append(node) - if final_op_name is not None and node.name == final_op_name: - break - return subgraph - - -def extract_mt5_encoder(): - """Extracts the mT5 subgraph corresponding to the encoder only.""" - return extract_mt5_subgraph(final_op_name="encoder_dropout_1") - - -def init_ff_mt5_encoder(encoder_labels_filepath: str): - """ - Initializes the FlexFlow representation of the HuggingFace mT5 model's - encoder. - - Returns: - (ffmodel, input_dls, label_dl) - - ffmodel (FFModel): Compiled and initialized FlexFlow model representing - HuggingFace mT5's encoder. 
- input_dls (List[SingleDataLoader]): List consisting of the encoder - input IDs, encoder attention mask, and decoder input IDs - dataloaders. - label_dl (SingleDataLoader): Label dataloader. - """ - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_ids, attention_mask, decoder_input_ids, _ = load_batch_ff() - labels = torch.load(encoder_labels_filepath).detach().numpy() - input_tensors = [ - ffmodel.create_tensor(input_ids.shape, DataType.DT_INT64), - ffmodel.create_tensor(attention_mask.shape, DataType.DT_INT64), - ffmodel.create_tensor(decoder_input_ids.shape, DataType.DT_INT64), - ] - # Add the encoder operators to `ffmodel` - mt5_encoder_graph = extract_mt5_encoder() - input_index = 0 - output_tensors = [] - node_to_output = OrderedDict() - for node in mt5_encoder_graph: - if isinstance(node, InputNode): - node_output = node.to_ff(input_tensors, input_index) - input_index += 1 - elif isinstance(node, OutputNode): - node.to_ff(ffmodel, node_to_output, output_tensors) - node_output = None - else: - node_output = node.to_ff(ffmodel, node_to_output) - if node_output is not None: - node_to_output[node.name] = node_output - # Compile and initialize the model - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) - ffmodel.compile( - optimizer=ffoptimizer, - loss_type=LossType.LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, - metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], - ) - input_ids_dl = ffmodel.create_data_loader(input_tensors[0], input_ids) - attention_mask_dl = ffmodel.create_data_loader( - input_tensors[1], attention_mask, - ) - decoder_input_ids_dl = ffmodel.create_data_loader( - input_tensors[2], decoder_input_ids, - ) - label_dl = ffmodel.create_data_loader(ffmodel.label_tensor, labels) - input_dls = [input_ids_dl, attention_mask_dl, decoder_input_ids_dl] - ffmodel.init_layers() - return (ffmodel, input_dls, label_dl) diff --git a/tests/align/multiply/align_multiply_ff.py b/tests/align/multiply/align_multiply_ff.py deleted file mode 100644 index 2320bb40dc..0000000000 --- a/tests/align/multiply/align_multiply_ff.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import Linear, Op, Parameter -from flexflow.type import AggrMode - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import BATCH_SIZE, gen_tensor - -OUT_DIR = os.path.join("align", "multiply", "out") - - -def run(): - INPUT_SIZE = 512 - SEQ_LENGTH = 5 - inp1: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - inp2: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.multiply( - x=input_tensor_1, - y=input_tensor_2, - name="multiply" - ) - - # compile - compile_ffmodel(ffmodel) - dls = init_ffmodel(ffmodel, ((input_tensor_1, inp1), (input_tensor_2, inp2)), label) - assert len(dls) == 3 - inp1_dl, inp2_dl, label_dl = dls - - # forward/backward pass - run_fwd_bwd(ffmodel, ffconfig, (inp1_dl, inp2_dl), label_dl) - - # save data - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - 
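# Note: the save helpers used here (see align_ff_utils.py above) copy the
# FlexFlow tensors to host numpy arrays before calling torch.save, so the
# ff_*.pt files written below are directly comparable with the torch-side
# outputs produced by the corresponding align_*_torch.py scripts.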
save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - - - - -if __name__ == "__main__": - run() diff --git a/tests/align/multiply/align_multiply_torch.py b/tests/align/multiply/align_multiply_torch.py deleted file mode 100644 index 47819ed666..0000000000 --- a/tests/align/multiply/align_multiply_torch.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -OUT_DIR = os.path.join("align", "multiply", "out") - -def run(): - INPUT_SIZE = 512 - SEQ_LENGTH = 5 - - inp1: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - inp2: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - output = torch.mul( - input=inp1, - other=inp2 - ).to(DEVICE) - output.requires_grad = True - output.retain_grad() - - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - - -if __name__ == "__main__": - run() \ No newline at end of file diff --git a/tests/align/multiply/gen_tensors.sh b/tests/align/multiply/gen_tensors.sh deleted file mode 100755 index 32c9553d16..0000000000 --- a/tests/align/multiply/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/multiply/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/multiply/align_multiply_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/multiply/align_multiply_torch.py; - diff --git a/tests/align/subtract/align_subtract_ff.py b/tests/align/subtract/align_subtract_ff.py deleted file mode 100644 index dd9659cf6f..0000000000 --- a/tests/align/subtract/align_subtract_ff.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import sys - -import torch -from flexflow.core import * -from flexflow.core.flexflow_cffi import Linear, Op, Parameter -from flexflow.type import AggrMode - -sys.path.append("./align/") -from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd, - save_param_ff, save_param_grad_ff, save_tensor_ff, - save_tensor_grad_ff) -from align_utils import BATCH_SIZE, gen_tensor - -OUT_DIR = os.path.join("align", "subtract", "out") - - -def run(): - INPUT_SIZE = 512 - SEQ_LENGTH = 5 - inp1: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - inp2: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ) - - ffconfig = FFConfig() - ffmodel = FFModel(ffconfig) - input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT) - input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT) - output_tensor = ffmodel.subtract( - x=input_tensor_1, - y=input_tensor_2, - name="subtract" - ) - - # compile - compile_ffmodel(ffmodel) - dls = init_ffmodel(ffmodel, ((input_tensor_1, inp1), (input_tensor_2, inp2)), label) - assert len(dls) == 3 - inp1_dl, inp2_dl, label_dl = dls - - # forward/backward pass - run_fwd_bwd(ffmodel, ffconfig, (inp1_dl, inp2_dl), label_dl) - 
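A note on the `requires_grad` / `retain_grad()` dance in the PyTorch halves such as `align_multiply_torch.py` above: the inputs deliberately do not require gradients, because the test only needs dLoss/dOutput at the op boundary, not gradients flowing back through the operator itself. Flipping `requires_grad` on the op output and calling `retain_grad()` makes `backward()` populate `output.grad` directly. A small self-contained illustration of the pattern (shapes are arbitrary):

```python
import torch

# inp1/inp2 deliberately do NOT require grad: the test only needs
# dLoss/dOutput, not gradients through the multiply itself.
inp1 = torch.randn(4, 5)
inp2 = torch.randn(4, 5)
label = torch.randn(4, 5)

output = torch.mul(inp1, inp2)
output.requires_grad = True  # make the op output accumulate a gradient
output.retain_grad()         # keep .grad populated even for non-leaf tensors

loss = torch.nn.MSELoss(reduction="mean")(output, label)
loss.backward()

# For mean-reduced MSE, dLoss/dOutput = 2 * (output - label) / output.numel()
expected = 2 * (output - label) / output.numel()
assert torch.allclose(output.grad, expected)
```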
- # save data - save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt")) - save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt")) - - - - -if __name__ == "__main__": - run() diff --git a/tests/align/subtract/align_subtract_torch.py b/tests/align/subtract/align_subtract_torch.py deleted file mode 100644 index 1393d40eb7..0000000000 --- a/tests/align/subtract/align_subtract_torch.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -OUT_DIR = os.path.join("align", "subtract", "out") - -def run(): - INPUT_SIZE = 512 - SEQ_LENGTH = 5 - - inp1: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - inp2: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), - dtype="float32" - ).to(DEVICE) - output = torch.sub( - input=inp1, - other=inp2 - ).to(DEVICE) - output.requires_grad = True - output.retain_grad() - - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - - -if __name__ == "__main__": - run() \ No newline at end of file diff --git a/tests/align/subtract/gen_tensors.sh b/tests/align/subtract/gen_tensors.sh deleted file mode 100755 index ae25ef0bf7..0000000000 --- a/tests/align/subtract/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/subtract/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/subtract/align_subtract_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/subtract/align_subtract_torch.py; - diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh deleted file mode 100755 index 02e0934dfc..0000000000 --- a/tests/align/test_all_operators.sh +++ /dev/null @@ -1,32 +0,0 @@ -#! 
/usr/bin/env bash
-eval "$(conda shell.bash hook)"
-
-rm -rf align/out
-
-function generate_ff_tensor(){
-    ./python/flexflow_python tests/align/align_create_tensor_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16 -o "$1"
-}
-
-function generate_torch_tensor(){
-    python tests/align/align_create_tensor_torch.py -o "$1"
-}
-
-ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather)
-
-# create flexflow tensors
-conda activate flexflow
-conda info --envs
-for(( i=0;i<${#ops[@]};i++))
-do
-    generate_ff_tensor "${ops[i]}";
-done;
-
-# create torch tensors
-conda activate pytorch
-for(( i=0;i<${#ops[@]};i++))
-do
-    generate_torch_tensor "${ops[i]}";
-done;
-
-conda activate flexflow
-python -m pytest tests/align/align_test.py
diff --git a/tests/align/view_embedding/align_view_embedding_ff.py b/tests/align/view_embedding/align_view_embedding_ff.py
deleted file mode 100644
index c58dfd713d..0000000000
--- a/tests/align/view_embedding/align_view_embedding_ff.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import os
-import sys
-
-import torch
-from flexflow.core import *
-from flexflow.core.flexflow_cffi import Embedding, Op, Parameter
-from flexflow.torch.model import FunctionNode
-from flexflow.type import AggrMode
-
-sys.path.append("./align/")
-from align_ff_utils import (compile_ffmodel, init_ffmodel, run_fwd_bwd,
-                            save_param_ff, save_param_grad_ff, save_tensor_ff,
-                            save_tensor_grad_ff)
-from align_utils import BATCH_SIZE, gen_tensor
-
-SEQ_LENGTH = 5
-OUT_DIR = os.path.join("align", "view_embedding", "out")
-
-
-def run():
-    NUM_EMBEDDINGS = 250112
-    EMBEDDING_DIM = 512
-    inp: torch.Tensor = gen_tensor(
-        (BATCH_SIZE, SEQ_LENGTH),
-        dtype="int64",
-        low=0,
-        high=NUM_EMBEDDINGS,
-    )
-    label: torch.Tensor = gen_tensor(
-        (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM),
-        dtype="float32",
-    )
-
-    ffconfig = FFConfig()
-    ffmodel = FFModel(ffconfig)
-    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_INT64)
-    # Treat `view()` as a special case of `reshape()`
-    view_tensor = ffmodel.reshape(
-        input=input_tensor,
-        shape=FunctionNode.get_view_shape(input_tensor, (-1, inp.shape[-1])),
-        name="view",
-    )
-    output_tensor = ffmodel.embedding(
-        input=view_tensor,
-        num_embeddings=NUM_EMBEDDINGS,
-        embedding_dim=EMBEDDING_DIM,
-        aggr=AggrMode.AGGR_MODE_NONE,
-        kernel_initializer=NormInitializer(seed=42, mean=0, stddev=1),
-        name="embedding",
-    )
-
-    compile_ffmodel(ffmodel)
-    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
-    assert len(dls) == 2
-    inp_dl, label_dl = dls
-    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)
-
-    embedding_layer: Op = ffmodel.get_layers()[1]
-    assert isinstance(embedding_layer, Embedding)
-    embedding_weight: Parameter = embedding_layer.get_weight_tensor()
-    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
-    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
-    save_param_ff(embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
-    save_param_grad_ff(embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt"))
-
-
-if __name__ == "__main__":
-    run()
diff --git a/tests/align/view_embedding/align_view_embedding_torch.py b/tests/align/view_embedding/align_view_embedding_torch.py
deleted file mode 100644
index b6f696cabf..0000000000
--- a/tests/align/view_embedding/align_view_embedding_torch.py
+++ /dev/null
@@ -1,53
+0,0 @@ -import os -import sys - -import torch - -sys.path.append("./align/") -from align_utils import gen_tensor, BATCH_SIZE - -assert torch.cuda.is_available(), "Expects at least one GPU" -DEVICE = torch.device(0) -SEQ_LENGTH = 5 -OUT_DIR = os.path.join("align", "view_embedding", "out") - - -def run(): - NUM_EMBEDDINGS = 250112 - EMBEDDING_DIM = 512 - embedding = torch.nn.Embedding( - num_embeddings=NUM_EMBEDDINGS, - embedding_dim=EMBEDDING_DIM, - device=DEVICE, - ) - embedding_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt")) - assert embedding_weight.shape == embedding.weight.shape, \ - "Shape mismatch: " \ - f"FF={embedding_weight.shape} torch={embedding.weight.shape}" - embedding.weight = torch.nn.Parameter(embedding_weight.to(DEVICE)) - - inp: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH), - dtype="int64", - low=0, - high=NUM_EMBEDDINGS, - ).to(DEVICE) - label: torch.Tensor = gen_tensor( - (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM), - dtype="float32", - ).to(DEVICE) - - output = embedding(inp.view(-1, inp.shape[-1])) - embedding.zero_grad() - output.retain_grad() - loss_fn = torch.nn.MSELoss(reduction="mean") - loss = loss_fn(output, label) - loss.backward() - - torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt")) - torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt")) - torch.save(embedding.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt")) - - -if __name__ == "__main__": - run() diff --git a/tests/align/view_embedding/gen_tensors.sh b/tests/align/view_embedding/gen_tensors.sh deleted file mode 100755 index 716b16590d..0000000000 --- a/tests/align/view_embedding/gen_tensors.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /usr/bin/env bash - -eval "$(conda shell.bash hook)"; -rm align/view/out/*.pt; -conda activate flexflow; -./python/flexflow_python align/view_embedding/align_view_embedding_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16; -conda activate pytorch; -python align/view_embedding/align_view_embedding_torch.py; - diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh deleted file mode 100755 index 53e8f49fa2..0000000000 --- a/tests/cpp_gpu_tests.sh +++ /dev/null @@ -1,83 +0,0 @@ -#! /usr/bin/env bash -set -e - -# Cd into directory holding this script -cd "${BASH_SOURCE[0]%/*}" - -if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit 1; fi -GPUS=$1 -BATCHSIZE=$((GPUS * 64)) -FSIZE=13800 -ZSIZE=12192 - -remove_mnist() { - rm -f train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz train-images-idx3-ubyte train-labels-idx1-ubyte -} - -download_mnist() { - if [[ ! -f train-images-idx3-ubyte || ! -f train-labels-idx1-ubyte ]]; then - remove_mnist - wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz - wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz - gzip -d train-images-idx3-ubyte.gz - gzip -d train-labels-idx1-ubyte.gz - fi -} - -# Check if the AlexNet/alexnet example exists in the build folder. 
If so, run the tests out of the build folder -# Otherwise, look for the example binaries in the folders in the PATH, plus in the subdirectory of the flexflow -# Python package (if it exists) -if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then - echo "Running C++ tests from folder: $FF_HOME/build/examples/cpp" - "$FF_HOME"/build/examples/cpp/AlexNet/alexnet -ll:gpu 1 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # TODO: fix DLRM test - # "$FF_HOME"/build/examples/cpp/DLRM/dlrm -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - "$FF_HOME"/build/examples/cpp/InceptionV3/inception -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - "$FF_HOME"/build/examples/cpp/MLP_Unify/mlp_unify -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - "$FF_HOME"/build/examples/cpp/ResNet/resnet -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - "$FF_HOME"/build/examples/cpp/Transformer/transformer -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b $((GPUS * 8)) --only-data-parallel - "$FF_HOME"/build/examples/cpp/XDL/xdl -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - "$FF_HOME"/build/examples/cpp/candle_uno/candle_uno -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - download_mnist - "$FF_HOME"/build/examples/cpp/mixture_of_experts/moe -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b 64 --only-data-parallel - remove_mnist - "$FF_HOME"/build/examples/cpp/resnext50/resnext50 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # TODO: fix split tests - # "$FF_HOME"/build/examples/cpp/split_test/split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -else - python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))") - OLD_PATH="$PATH" - OLD_LD_LIBRARY_PATH="$LD_LIBRARY_PATH" - export PATH="${python_packages}/flexflow/bin:${PATH}" - export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${LD_LIBRARY_PATH}" - IFS=: - found=false - for path in $PATH; do - if [[ -f "$path/alexnet" ]]; then - echo "Running C++ tests from folder: $path" - found=true - alexnet -ll:gpu 1 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # TODO: fix DLRM test - # dlrm -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - inception -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - mlp_unify -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - resnet -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - transformer -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b $((GPUS * 8)) --only-data-parallel - xdl -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - candle_uno -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - download_mnist - moe -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b 64 --only-data-parallel - remove_mnist - resnext50 -ll:gpu 
"$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # TODO: fix split tests - # split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - fi - done - export PATH="$OLD_PATH" - export LD_LIBRARY_PATH="$OLD_LD_LIBRARY_PATH" - if [ ! $found ]; then echo "C++ test binaries not found"; exit 1; fi -fi - - diff --git a/tests/multi_gpu_tests.sh b/tests/multi_gpu_tests.sh deleted file mode 100755 index 9815a7fd20..0000000000 --- a/tests/multi_gpu_tests.sh +++ /dev/null @@ -1,74 +0,0 @@ -#! /usr/bin/env bash -set -x -set -e - -# Default to single-node, single GPU -GPUS=${1:-1} # number of GPUS per node -NUM_NODES=${2:-1} # number of nodes -BATCHSIZE=$(( NUM_NODES * GPUS * 64)) -FSIZE=13800 -ZSIZE=12192 - -if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi - -if [[ $NUM_NODES -gt 1 ]]; then - export GPUS - export NUM_NODES - EXE="$FF_HOME"/tests/multinode_helpers/mpi_wrapper1.sh -else - EXE="$FF_HOME"/python/flexflow_python -fi - -echo "Running GPU tests with $NUM_NODES node(s) and $GPUS gpu(s)/node" -GPU_AVAILABLE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) -GPU_REQUESTED=$(( GPUS * NUM_NODES)) -if [ $GPU_REQUESTED -gt $(( GPU_AVAILABLE )) ]; then echo "The test requires $GPU_REQUESTED GPUs, but only $GPU_AVAILABLE are available. Try reducing the number of nodes, or the number of gpus/node." ; exit; fi - -#Sequential model tests -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - -#Keras other -$EXE "$FF_HOME"/examples/python/keras/callback.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/unary.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/reshape.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -ll:py 
1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/gather.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/regularizer.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - -#Functional API -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - -#Python -$EXE "$FF_HOME"/examples/python/native/print_layers.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/split.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/alexnet.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 40 --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 40 --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -ll:py 1 -ll:gpu "$GPUS" 
-ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 --only-data-parallel - -#Possible crash -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 40 --only-data-parallel diff --git a/tests/multinode_helpers/mpi_wrapper1.sh b/tests/multinode_helpers/mpi_wrapper1.sh deleted file mode 100755 index 87d17d11a3..0000000000 --- a/tests/multinode_helpers/mpi_wrapper1.sh +++ /dev/null @@ -1,12 +0,0 @@ -#! /usr/bin/env bash -set -x -set -e - -if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi -if [ -z "$NUM_NODES" ]; then echo "NUM_NODES variable is not defined, aborting tests"; exit; fi -if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exit; fi - -# We need to wrap the instruction below in its own script because MPI throws an error if we try -# to run "mpirun" more than once in the same script. Hence, we cannot simply call "mpirun" in the -# multi_gpu_tests.sh script -mpirun -np "$NUM_NODES" "$FF_HOME"/tests/multinode_helpers/mpi_wrapper2.sh "$@" diff --git a/tests/multinode_helpers/mpi_wrapper2.sh b/tests/multinode_helpers/mpi_wrapper2.sh deleted file mode 100755 index d16cbef6dc..0000000000 --- a/tests/multinode_helpers/mpi_wrapper2.sh +++ /dev/null @@ -1,18 +0,0 @@ -#! 
/usr/bin/env bash -set -x -set -e - -if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi -if [ -z "$NUM_NODES" ]; then echo "NUM_NODES variable is not defined, aborting tests"; exit; fi -if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exit; fi - -# We need to wrap the instruction below in its own script because the CUDA_VISIBLE_DEVICES environment -# variable will need to be set differently for each node, but the "mpirun" command should take a single -# executable as its first argument -CUDA_VISIBLE_DEVICES=$(seq -s, $((OMPI_COMM_WORLD_RANK * GPUS )) $(( OMPI_COMM_WORLD_RANK * GPUS +1 )) ) -export CUDA_VISIBLE_DEVICES - -EXE="$FF_HOME"/python/flexflow_python - -$EXE "$@" - diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc deleted file mode 100644 index 7931f44129..0000000000 --- a/tests/ops/batch_matmul_test.cc +++ /dev/null @@ -1,124 +0,0 @@ -#include "model.h" -#include "test_utils.h" -#include -#include -#include -#include -using namespace Legion; -LegionRuntime::Logger::Category log_app("bmm_test"); - -struct BMMTestMeta { - int m, k, n, d; - BMMTestMeta(int _m, int _k, int _n, int _d) { - m = _m, k = _k, n = _n, d = _d; - } -}; - -BMMTestMeta get_test_meta(const std::string file_path) { - std::fstream myfile(file_path, std::ios_base::in); - int m, k, n, d; - myfile >> m >> k >> n >> d; - return BMMTestMeta(m, k, n, d); -} - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // std::cout<< "test framework launched" << std::endl; - auto test_meta = get_test_meta("test_meta.txt"); - FFConfig ffConfig; - // create ff model object - FFModel ff(ffConfig); - // create input tensor - Tensor dense_input1; - { - int const dims[3] = { - test_meta.d, test_meta.k, test_meta.m}; // target shape (d,k,m) - // HACK: have to pass "batch_matmul" 3-dimensional strategy string id to - // tell FF to distribute this tensor correctly - dense_input1 = ff.create_tensor<3>(dims, "batch_matmul", DT_FLOAT); - } - Tensor dense_input2; - { - int const dims[3] = { - test_meta.d, test_meta.k, test_meta.n}; // shape (n,k,d) - // HACK: have to pass "batch_matmul" 3-dimensional strategy string id to - // tell FF to distribute this tensor correctly - dense_input2 = ff.create_tensor<3>(dims, "batch_matmul", DT_FLOAT); - } - // build batch matmul layer - Tensor batch_matmul_ret = ff.batch_matmul("batch_matmul", - dense_input1, - dense_input2, - true /* trans_a */, - false /* trans_b */); - // load inputs tensors and output gradients tensors for testing - auto input1_file_path = "test_input1.txt"; - auto input2_file_path = "test_input2.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file(input1_file_path, dense_input1, ff, "float", 3); - initialize_tensor_from_file(input2_file_path, dense_input2, ff, "float", 3); - initialize_tensor_gradient_from_file( - output_grad_file_path, batch_matmul_ret, ff, "float", 3); - // run forward and backward to produce results - ff.init_layers(); - ff.forward(); - ff.backward(); - // dump results to file for python validation - dump_region_to_file(ff, batch_matmul_ret.region, "output.txt", 3); - dump_region_to_file(ff, dense_input1.region_grad, "input1_grad.txt", 3); - dump_region_to_file(ff, dense_input2.region_grad, "input2_grad.txt", 3); -} - -void register_custom_tasks() { - // std::cout << - // static_cast::type>(ZERO_INIT_TASK_ID) << - // std::endl; - { - TaskVariantRegistrar 
registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - "Load 2d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 2d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - "Load 3d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 3d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - "Load 4d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 4d tensor Task"); - } - - { - TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Compare Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Compare Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Compare Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Compare Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Compare Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Compare Tensor Task"); - } -} diff --git a/tests/ops/batch_matmul_test.cu b/tests/ops/batch_matmul_test.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc deleted file mode 100644 index c67b718e0e..0000000000 --- a/tests/ops/concat_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -#include "model.h" -#include "test_utils.h" -#include -#include -#include -#include -using namespace Legion; -LegionRuntime::Logger::Category log_app("concat_test"); - -struct ConcatTestMeta { - int batch_size, i_dim, num_channels, projected_num_channels, - dense_projection_i_dim; - ConcatTestMeta(int _batch_size, - int _i_dim, - int _num_channels, - int _projected_num_channels, - int _dense_projection_i_dim) { - batch_size = _batch_size, num_channels = _num_channels, i_dim = _i_dim, - projected_num_channels = _projected_num_channels, - dense_projection_i_dim = _dense_projection_i_dim; - } -}; - -ConcatTestMeta get_test_meta(const std::string file_path) { - std::fstream myfile(file_path, std::ios_base::in); - int batch_size, i_dim, num_channels, projected_num_channels, - dense_projection_i_dim; - myfile >> batch_size >> i_dim >> num_channels >> projected_num_channels >> - dense_projection_i_dim; - return ConcatTestMeta(batch_size, - i_dim, - num_channels, - projected_num_channels, - dense_projection_i_dim); -} - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - std::cout << "test framework launched" << std::endl; - auto test_meta = get_test_meta("test_meta.txt"); - FFConfig ffConfig; - // create ff model object - FFModel ff(ffConfig); - - // create embeddings - int dense_embedding_channels = test_meta.num_channels / 2; - int sparse_embedding_channels = - test_meta.num_channels - dense_embedding_channels; - auto dense_embedding_file_path = "test_input2.txt"; - auto sparse_embedding_file_path = "test_input3.txt"; - Tensor dense_embeddings[dense_embedding_channels]; - Tensor sparse_embeddings[sparse_embedding_channels]; - 
for (int i = 0; i < dense_embedding_channels; i++) { - int const dims[2] = {test_meta.batch_size, test_meta.i_dim}; - dense_embeddings[i] = ff.create_tensor<2>(dims, "", DT_FLOAT); - // init tensor is checked, nothing wrong in init tensor - // dense_embeddings[i] also checked, it's correct - initialize_tensor_from_file( - dense_embedding_file_path, dense_embeddings[i], ff, "float", 2); - } - - for (int i = 0; i < sparse_embedding_channels; i++) { - int const dims[2] = {test_meta.batch_size, test_meta.i_dim}; - sparse_embeddings[i] = ff.create_tensor<2>(dims, "", DT_FLOAT); - // init tensor is checked, nothing wrong in init tensor - // sparse_embeddings[i] also checked, it's correct - initialize_tensor_from_file( - sparse_embedding_file_path, sparse_embeddings[i], ff, "float", 2); - // std::ostringstream stringStream; - // stringStream << "sparse_embedding" << i << "_output.txt"; - // std::string copyOfStr = stringStream.str(); - // dump_region_to_file(ff, sparse_embeddings[i].region, copyOfStr, 2); - } - - // merge two embedding lists - std::vector dense_embeddings_v( - dense_embeddings, dense_embeddings + dense_embedding_channels); - std::vector sparse_embeddings_v( - sparse_embeddings, sparse_embeddings + sparse_embedding_channels); - std::vector embeddings; - embeddings.insert(embeddings.begin(), - sparse_embeddings_v.begin(), - sparse_embeddings_v.end()); - embeddings.insert( - embeddings.end(), dense_embeddings_v.begin(), dense_embeddings_v.end()); - - auto ret = - ff.concat("concat_input", test_meta.num_channels, &embeddings[0], 1); - - // load inputs tensors and output gradients tensors for testing - // use output for output grad (testing only) - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", 2); - - ff.optimizer = new SGDOptimizer(&ff, 0.01f); - // run forward and backward to produce results - ff.init_layers(); - ff.forward(); - // dump results to file for python validation - dump_region_to_file(ff, ret.region, "output.txt", 2); -} - -void register_custom_tasks() { - { - TaskVariantRegistrar registrar(INIT_TENSOR_1D_FROM_FILE_CPU_TASK, - "Load 1d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 1d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - "Load 2d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 2d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - "Load 3d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 3d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - "Load 4d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 4d tensor Task"); - } - - { - TaskVariantRegistrar registrar(DUMP_TENSOR_1D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Dump Tensor"); - 
registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } -} diff --git a/tests/ops/concat_test.cu b/tests/ops/concat_test.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc deleted file mode 100644 index 428893a0dc..0000000000 --- a/tests/ops/flat_test.cc +++ /dev/null @@ -1,143 +0,0 @@ -#include "model.h" -// #include "test_utils.h" -#include "test_utils.h" -#include -#include -#include -#include - -using namespace Legion; -LegionRuntime::Logger::Category log_app("Flat_test"); - -struct FlatTestMeta { - int i_dim, o_dim; - int *i_shape; - int *o_shape; - FlatTestMeta(int _i_dim, int _o_dim, int *_i_shape, int *_o_shape) { - i_dim = _i_dim; - o_dim = _o_dim; - i_shape = _i_shape; - o_shape = _o_shape; - } -}; - -FlatTestMeta get_test_meta(const std::string file_path) { - std::fstream myfile(file_path, std::ios_base::in); - int b; - std::vector buffer; - while (myfile >> b) { - buffer.push_back(b); - } - int i_dim(buffer[0]), o_dim(buffer[1]); - int *i_shape = new int[i_dim]; - int *o_shape = new int[o_dim]; - int offset = 2; - for (int i = 0; i < i_dim; i++) { - i_shape[i] = buffer[i + offset]; - } - offset += i_dim; - for (int i = 0; i < o_dim; i++) { - o_shape[i] = buffer[i + offset]; - } - // int m,k,d; - // myfile >> m >> k >> d; - return FlatTestMeta(i_dim, o_dim, i_shape, o_shape); -} - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // std::cout<< "test framework launched" << std::endl; - auto test_meta = get_test_meta("test_meta.txt"); - FFConfig ffConfig; - // create ff model object - FFModel ff(ffConfig); - Tensor dense_input; -#define input_dim 3 - int const i_dims[input_dim] = { - test_meta.i_shape[0], test_meta.i_shape[1], test_meta.i_shape[2] - // test_meta.i_shape[3] - }; - // std::cout << test_meta.i_shape[0] << test_meta.i_shape[1] << - // test_meta.i_shape[2] << test_meta.i_shape[3] << std::endl; - dense_input = ff.create_tensor(i_dims, "", DT_FLOAT); - Tensor ret = ff.flat("", dense_input); - auto input1_file_path = "test_input1.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file(input1_file_path, dense_input, ff, "float", 3); - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", 2); - // run forward and backward to produce results - ff.init_layers(); - // forward - ff.forward(); - dump_region_to_file(ff, ret.region, "output.txt", 2); - - ff.backward(); - dump_region_to_file(ff, dense_input.region_grad, "input1_grad.txt", 3); -} - -void register_custom_tasks() { - { - TaskVariantRegistrar registrar(INIT_TENSOR_1D_FROM_FILE_CPU_TASK, - "Load 1d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 1d tensor Task"); - } - { - TaskVariantRegistrar 
registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - "Load 2d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 2d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - "Load 3d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 3d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - "Load 4d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 4d tensor Task"); - } - - { - TaskVariantRegistrar registrar(DUMP_TENSOR_1D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } -} diff --git a/tests/ops/flat_test.cu b/tests/ops/flat_test.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/ops/linear_test.cc b/tests/ops/linear_test.cc deleted file mode 100644 index 5b65de3a56..0000000000 --- a/tests/ops/linear_test.cc +++ /dev/null @@ -1,187 +0,0 @@ -#include "model.h" -#include "test_utils.h" -#include -#include -#include -#include -using namespace Legion; -LegionRuntime::Logger::Category log_app("linear_test"); - -struct LinearTestMeta { - int batch_size, i_dim, num_channels, dense_projection_o_dim, - dense_projection_i_dim; - LinearTestMeta(int _batch_size, - int _i_dim, - int _num_channels, - int _dense_projection_o_dim, - int _dense_projection_i_dim) { - batch_size = _batch_size, num_channels = _num_channels, i_dim = _i_dim, - dense_projection_o_dim = _dense_projection_o_dim, - dense_projection_i_dim = _dense_projection_i_dim; - } -}; - -LinearTestMeta get_test_meta(const std::string file_path) { - std::fstream myfile(file_path, std::ios_base::in); - int batch_size, i_dim, num_channels, dense_projection_o_dim, - dense_projection_i_dim; - myfile >> batch_size >> i_dim >> num_channels >> dense_projection_o_dim >> - dense_projection_i_dim; - return LinearTestMeta(batch_size, - i_dim, - num_channels, - dense_projection_o_dim, - dense_projection_i_dim); -} - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - std::cout << "test framework launched" << std::endl; - auto test_meta = get_test_meta("test_meta.txt"); - FFConfig ffConfig; - // create ff model object - FFModel ff(ffConfig); - IndexSpace task_is = IndexSpaceT<2>(ff.get_or_create_task_is(2, "")); - Initializer *kernel_initializer = new ZeroInitializer(); - Initializer *bias_initializer = new 
ZeroInitializer();
-  Tensor weights;
-  {
-    int const dims[2] = {test_meta.dense_projection_o_dim,
-                         test_meta.dense_projection_i_dim};
-    weights = ff.create_linear_weight<2>(
-        dims, (IndexSpaceT<2>)task_is, DT_FLOAT, kernel_initializer);
-    auto weights_file_path = "test_kernel1.txt";
-    initialize_tensor_from_file(weights_file_path, weights, ff, "float", 2);
-  }
-  Tensor bias;
-  {
-    int const dims[1] = {test_meta.dense_projection_o_dim};
-    bias = ff.create_linear_weight<1>(
-        dims, (IndexSpaceT<2>)task_is, DT_FLOAT, bias_initializer);
-    auto bias_file_path = "test_bias1.txt";
-    initialize_tensor_from_file(bias_file_path, bias, ff, "float", 1);
-  }
-
-  auto dense_projection_file_path = "test_input1.txt";
-
-  // create dense projection
-  Tensor dense_projection;
-  {
-    int const dims[2] = {test_meta.batch_size,
-                         test_meta.dense_projection_i_dim};
-    dense_projection = ff.create_tensor<2>(dims, "", DT_FLOAT);
-    // dense_projection = ff.create_linear_weight<2>(dims,
-    // (IndexSpaceT<2>)task_is, DT_FLOAT, kernel_initializer);
-    initialize_tensor_from_file(
-        dense_projection_file_path, dense_projection, ff, "float", 2);
-  }
-
-  auto output_grad_file_path = "test_output_grad.txt";
-
-  // build dense (linear) layer
-  Tensor ret = ff.dense("",
-                        dense_projection,
-                        test_meta.dense_projection_o_dim,
-                        AC_MODE_NONE,
-                        true,
-                        NULL,
-                        NULL,
-                        &weights,
-                        NULL);
-  // init gradient
-  initialize_tensor_gradient_from_file(
-      output_grad_file_path, ret, ff, "float", 2);
-
-  /*
-    TODO
-    1. mid-size problem kernels don't match
-    2. test linear consistency with large problems,
-       because we don't know whether SGD performs consistently
-  */
-  ff.optimizer = new SGDOptimizer(&ff, 0.01f, 0.0f);
-  // run forward and backward to produce results
-  ff.init_layers();
-  int epochs = 1;
-  ff.forward();
-  for (int i = 0; i < epochs; i++) {
-    ff.backward();
-    ff.update();
-  }
-
-  initialize_tensor_from_file(
-      dense_projection_file_path, dense_projection, ff, "float", 2);
-  ff.forward();
-  // dump results to file for python validation
-  dump_region_to_file(ff, ret.region, "output.txt", 2);
-  // dump_region_to_file(ff, dense_projection.region, "dump.txt", 2);
-  auto kernel = ff.parameters[0].tensor;
-  dump_region_to_file(ff, kernel.region, "kernel_updated1.txt", 2);
-  // kernel = ff.parameters[1].tensor;
-  // dump_region_to_file(ff, kernel.region_grad, "kernel_grad2.txt", 1);
-}
-
-void register_custom_tasks() {
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_1D_FROM_FILE_CPU_TASK,
-                                   "Load 1d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant>(
-        registrar, "Load 1d tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK,
-                                   "Load 2d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant>(
-        registrar, "Load 2d tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK,
-                                   "Load 3d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant>(
-        registrar, "Load 3d tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK,
-                                   "Load 4d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant>(
-        registrar, "Load 4d tensor Task");
-  }
-
-  {
-    TaskVariantRegistrar registrar(DUMP_TENSOR_1D_CPU_TASK, "Dump Tensor");
- registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } -} diff --git a/tests/ops/linear_test.cu b/tests/ops/linear_test.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc deleted file mode 100644 index e8f4586b23..0000000000 --- a/tests/ops/reshape_test.cc +++ /dev/null @@ -1,179 +0,0 @@ -#include "model.h" -#include "test_utils.h" -#include -#include -#include -#include -#define PRECISION 16 -using namespace Legion; -LegionRuntime::Logger::Category log_app("Reshape_test"); - -struct ReshapeTestMeta { - int i_dim, o_dim; - int *i_shape; - int *o_shape; - ReshapeTestMeta(int _i_dim, int _o_dim, int *_i_shape, int *_o_shape) { - i_dim = _i_dim; - o_dim = _o_dim; - i_shape = _i_shape; - o_shape = _o_shape; - } -}; - -ReshapeTestMeta get_test_meta(const std::string file_path) { - std::fstream myfile(file_path, std::ios_base::in); - int b; - std::vector buffer; - while (myfile >> b) { - buffer.push_back(b); - } - int i_dim(buffer[0]), o_dim(buffer[1]); - int *i_shape = new int[i_dim]; - int *o_shape = new int[o_dim]; - int offset = 2; - for (int i = 0; i < i_dim; i++) { - i_shape[i] = buffer[i + offset]; - } - offset += i_dim; - for (int i = 0; i < o_dim; i++) { - o_shape[i] = buffer[i + offset]; - } - // int m,k,d; - // myfile >> m >> k >> d; - return ReshapeTestMeta(i_dim, o_dim, i_shape, o_shape); -} - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // std::cout<< "test framework launched" << std::endl; - auto test_meta = get_test_meta("test_meta.txt"); - FFConfig ffConfig; - // create ff model object - FFModel ff(ffConfig); - Tensor dense_input; - if (test_meta.i_dim == 3 && test_meta.o_dim == 2) { -#define input_dim 3 -#define output_dim 2 - int const i_dims[input_dim] = { - test_meta.i_shape[0], test_meta.i_shape[1], test_meta.i_shape[2]}; - int const o_shape[output_dim] = { - test_meta.o_shape[0], - test_meta.o_shape[1], - }; - dense_input = ff.create_tensor(i_dims, "", DT_FLOAT); - Tensor ret = ff.reshape("", dense_input, o_shape); - auto input1_file_path = "test_input1.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file( - input1_file_path, dense_input, ff, "float", input_dim); - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", output_dim); - // run forward and backward to produce results - ff.init_layers(); - // forward - ff.forward(); - dump_region_to_file(ff, ret.region, "output.txt", output_dim); - ff.backward(); - dump_region_to_file( - ff, dense_input.region_grad, "input1_grad.txt", input_dim); -#undef input_dim -#undef output_dim - } else if 
(test_meta.i_dim == 2 && test_meta.o_dim == 3) { -#define input_dim 2 -#define output_dim 3 - int const i_dims[input_dim] = { - test_meta.i_shape[0], - test_meta.i_shape[1], - }; - int const o_shape[output_dim] = { - test_meta.o_shape[0], test_meta.o_shape[1], test_meta.o_shape[2]}; - dense_input = ff.create_tensor(i_dims, "", DT_FLOAT); - Tensor ret = ff.reshape("", dense_input, o_shape); - auto input1_file_path = "test_input1.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file( - input1_file_path, dense_input, ff, "float", input_dim); - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", output_dim); - // run forward and backward to produce results - ff.init_layers(); - // forward - ff.forward(); - dump_region_to_file(ff, ret.region, "output.txt", output_dim); - ff.backward(); - dump_region_to_file( - ff, dense_input.region_grad, "input1_grad.txt", input_dim); -#undef input_dim -#undef output_dim - } else { - printf( - "i_dim %d o_dim %d not supported\n", test_meta.i_dim, test_meta.o_dim); - throw 255; - } -} - -void register_custom_tasks() { - { - TaskVariantRegistrar registrar(INIT_TENSOR_1D_FROM_FILE_CPU_TASK, - "Load 1d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 1d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - "Load 2d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 2d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - "Load 3d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 3d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - "Load 4d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 4d tensor Task"); - } - - { - TaskVariantRegistrar registrar(DUMP_TENSOR_1D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } -} diff --git a/tests/ops/reshape_test.cu b/tests/ops/reshape_test.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc deleted file mode 100644 index 1c24d96aaf..0000000000 --- a/tests/ops/tanh_test.cc +++ /dev/null @@ -1,191 +0,0 @@ -#include "model.h" -#include "test_utils.h" -#include -#include 
-#include -#include -#define PRECISION 16 -using namespace Legion; -LegionRuntime::Logger::Category log_app("Tanh_test"); - -struct TanhTestMeta { - int i_dim, o_dim; - int *i_shape; - int *o_shape; - TanhTestMeta(int _i_dim, int _o_dim, int *_i_shape, int *_o_shape) { - i_dim = _i_dim; - o_dim = _o_dim; - i_shape = _i_shape; - o_shape = _o_shape; - } -}; - -TanhTestMeta get_test_meta(const std::string file_path) { - std::fstream myfile(file_path, std::ios_base::in); - int b; - std::vector buffer; - while (myfile >> b) { - buffer.push_back(b); - } - int i_dim(buffer[0]), o_dim(buffer[1]); - int *i_shape = new int[i_dim]; - int *o_shape = new int[o_dim]; - int offset = 2; - for (int i = 0; i < i_dim; i++) { - i_shape[i] = buffer[i + offset]; - } - offset += i_dim; - for (int i = 0; i < o_dim; i++) { - o_shape[i] = buffer[i + offset]; - } - // int m,k,d; - // myfile >> m >> k >> d; - return TanhTestMeta(i_dim, o_dim, i_shape, o_shape); -} - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // std::cout<< "test framework launched" << std::endl; - auto test_meta = get_test_meta("test_meta.txt"); - FFConfig ffConfig; - // create ff model object - FFModel ff(ffConfig); - Tensor dense_input; - if (test_meta.i_dim == 3) { -#define input_dim 3 - int const i_dims[input_dim] = { - test_meta.i_shape[0], test_meta.i_shape[1], test_meta.i_shape[2]}; - dense_input = ff.create_tensor(i_dims, "", DT_FLOAT); - Tensor ret = ff.tanh("", dense_input, i_dims); - auto input1_file_path = "test_input1.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file( - input1_file_path, dense_input, ff, "float", input_dim); - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", input_dim); - // run forward and backward to produce results - ff.init_layers(); - // forward - ff.forward(); - dump_region_to_file(ff, ret.region, "output.txt", input_dim); - ff.backward(); - dump_region_to_file( - ff, dense_input.region_grad, "input1_grad.txt", input_dim); -#undef input_dim - } else if (test_meta.i_dim == 2) { -#define input_dim 2 - int const i_dims[input_dim] = { - test_meta.i_shape[0], - test_meta.i_shape[1], - }; - dense_input = ff.create_tensor(i_dims, "", DT_FLOAT); - Tensor ret = ff.tanh("", dense_input, i_dims); - auto input1_file_path = "test_input1.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file( - input1_file_path, dense_input, ff, "float", input_dim); - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", input_dim); - // run forward and backward to produce results - ff.init_layers(); - // forward - ff.forward(); - dump_region_to_file(ff, ret.region, "output.txt", input_dim); - ff.backward(); - dump_region_to_file( - ff, dense_input.region_grad, "input1_grad.txt", input_dim); -#undef input_dim - } else if (test_meta.i_dim == 1) { -#define input_dim 1 - int const i_dims[input_dim] = {test_meta.i_shape[0]}; - dense_input = ff.create_tensor(i_dims, "", DT_FLOAT); - Tensor ret = ff.tanh("", dense_input, i_dims); - auto input1_file_path = "test_input1.txt"; - auto output_grad_file_path = "test_output_grad.txt"; - initialize_tensor_from_file( - input1_file_path, dense_input, ff, "float", input_dim); - initialize_tensor_gradient_from_file( - output_grad_file_path, ret, ff, "float", input_dim); - // run forward and backward to produce results - ff.init_layers(); - // forward - ff.forward(); - dump_region_to_file(ff, ret.region, "output.txt", 
input_dim); - ff.backward(); - dump_region_to_file( - ff, dense_input.region_grad, "input1_grad.txt", input_dim); -#undef input_dim - } - - else { - printf( - "i_dim %d o_dim %d not supported\n", test_meta.i_dim, test_meta.o_dim); - throw 255; - } -} - -void register_custom_tasks() { - { - TaskVariantRegistrar registrar(INIT_TENSOR_1D_FROM_FILE_CPU_TASK, - "Load 1d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 1d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - "Load 2d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 2d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - "Load 3d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 3d tensor Task"); - } - { - TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - "Load 4d Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>( - registrar, "Load 4d tensor Task"); - } - - { - TaskVariantRegistrar registrar(DUMP_TENSOR_1D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } - { - TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Dump Tensor"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant>(registrar, - "Dump Tensor Task"); - } -} diff --git a/tests/ops/tanh_test.cu b/tests/ops/tanh_test.cu deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/ops/test_bootstrap.sh b/tests/ops/test_bootstrap.sh deleted file mode 100755 index f3d1750b34..0000000000 --- a/tests/ops/test_bootstrap.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# request 8 gpus resource for testing -srun --nodes=1 --gres=gpu:8 --cpus-per-task=80 --partition=dev --time=30 --pty /bin/bash -l \ No newline at end of file diff --git a/tests/ops/test_build_all.sh b/tests/ops/test_build_all.sh deleted file mode 100755 index a7ba3f97d6..0000000000 --- a/tests/ops/test_build_all.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -make app="$FF_HOME"/tests/ops/batch_matmul_test -j -f Makefile -make app="$FF_HOME"/tests/ops/transpose_test -j -f Makefile -make app="$FF_HOME"/tests/ops/reshape_test -j -f Makefile -make app="$FF_HOME"/tests/ops/flat_test -j -f Makefile -make app="$FF_HOME"/tests/ops/tanh_test -j -f Makefile -make app="$FF_HOME"/tests/ops/concat_test -j -f Makefile -make app="$FF_HOME"/tests/ops/linear_test -j -f Makefile diff --git a/tests/ops/test_harness.py b/tests/ops/test_harness.py deleted file mode 100644 index cdfad46411..0000000000 
--- a/tests/ops/test_harness.py +++ /dev/null @@ -1,753 +0,0 @@ -import subprocess, time, unittest -from subprocess import PIPE, STDOUT -import numpy as np -import torch -import torch.optim as optim - - -def dump_numpy_array_to_file(ndarray, file_name): - buffer = [] - for entry in ndarray.flatten(): - buffer.append(entry) - buffer = ["%.6f"%x for x in buffer] - with open(file_name, 'w+') as f: - f.write(' '.join(buffer)) - -def dump_torch_tensor_to_file(tensor, file_name): - t = tensor.data.cpu().numpy() - dump_numpy_array_to_file(t, file_name) - -def batch_matmul_3d_reference(input1, input2, trans1, trans2): - ''' - Input layout: - input1 (d,k,m) - input2 (d,k,n) - output (d,m,n) - ''' - input1 = input1.transpose((0,2,1)) if trans1 else input1 - input2 = input2.transpose((0,2,1)) if trans2 else input2 - return np.matmul(input1, input2) - -def batch_transpose_3d_reference(input): - ''' - This operation transposes the inner 2 dimensions (flip inner two) - and assumes tensor outter dimension is sample dimension - ''' - return input.transpose((0,2,1)) - -def gen_FF_result(test_target, num_gpu): - command = 'cd ~/DLRM_FlexFlow/src/ops/tests/ && ./test_run_FF_target.sh %s %s' % (test_target, str(num_gpu)) - test_process = subprocess.Popen([command], shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT) - test_process.stdout.read() - test_process.wait() - -def is_equal_tensor_from_file(file_1, file_2, label='', epsilon=0.00001): - with open(file_1, 'r') as f: - input1 = f.readline() - with open(file_2, 'r') as f: - input2 = f.readline() - input1_flat = input1.strip().split(' ') - input1_flat = [float(x) for x in input1_flat] - input2_flat = input2.strip().split(' ') - input2_flat = [float(x) for x in input2_flat] - diff_set = set() - try: - np.testing.assert_allclose(input1_flat, input2_flat, rtol=epsilon, atol=epsilon) - except Exception as e: - print('checking equal %s failed, error message: %s' % (label, e)) - raise e - -def average_error_tolerance(file_1, file_2, label='', epsilon=0.00001): - with open(file_1, 'r') as f: - input1 = f.readline() - with open(file_2, 'r') as f: - input2 = f.readline() - input1_flat = input1.strip().split(' ') - input1_flat = [float(x) for x in input1_flat] - input2_flat = input2.strip().split(' ') - input2_flat = [float(x) for x in input2_flat] - input1_flat = np.array(input1_flat) - input2_flat = np.array(input2_flat) - avg_diff = abs(sum(input1_flat - input2_flat) / len(input1_flat)) - try: - assert(avg_diff < epsilon) - except Exception as e: - print('checking equal %s failed, error message: %s' % (label, e)) - raise e - -class Linear(torch.nn.Module): - def __init__(self, in_features, out_features, weights=None, bias=False): - super(Linear, self).__init__() - self.projected_rtc_layer = torch.nn.Linear(in_features, out_features, bias=bias) - self.h_channel_dim = out_features - if weights is not None: - assert len(weights) == 2 - # weights shape (out_features,in_features) - # bias shape (out_features) - self.projected_rtc_layer.weight = torch.nn.Parameter(torch.from_numpy(weights[0]).float(), requires_grad=True) - if bias: - self.projected_rtc_layer.bias = torch.nn.Parameter(torch.from_numpy(weights[1]).float(), requires_grad=True) - - def forward(self, dense_projection): - projected_rtc = self.projected_rtc_layer(dense_projection) - return projected_rtc - - -class DotCompressor(torch.nn.Module): - def __init__(self, in_features, out_features, weights=None, bias=False): - super(DotCompressor, self).__init__() - self.projected_rtc_layer = 
torch.nn.Linear(in_features, out_features, bias=bias) - self.h_channel_dim = out_features - if weights is not None: - assert len(weights) == 2 - # weights shape (out_features,in_features) - # bias shape (out_features) - self.projected_rtc_layer.weight = torch.nn.Parameter(torch.from_numpy(weights[0]).float(), requires_grad=True) - if bias: - self.projected_rtc_layer.bias = torch.nn.Parameter(torch.from_numpy(weights[1]).float(), requires_grad=True) - - def forward(self, dense_embeddings, sparse_embeddings, dense_projection, debug=False, debuglv2=False): - if dense_projection is not None: - assert len(dense_projection.shape) == 2 - assert len(dense_embeddings[0].shape) == 2 - assert len(sparse_embeddings[0].shape) == 2 - assert len(dense_embeddings) > 0, 'embeddings to be compressed can not be empty' - assert len(sparse_embeddings) > 0, 'embeddings to be compressed can not be empty' - num_channels = len(dense_embeddings) + len(sparse_embeddings) - batch_size = dense_embeddings[0].shape[0] - i_dim = dense_embeddings[0].shape[1] - if debug: - # print("dense embeddings", dense_embeddings.data.cpu().numpy()) - print("dense embeddings", dense_embeddings[0].shape, len(dense_embeddings)) - # print("sparse embeddings", sparse_embeddings.data.cpu().numpy()) - print("sparse embeddings", sparse_embeddings[0].shape, len(sparse_embeddings)) - # concat embeddings - cat_input = torch.cat(sparse_embeddings+dense_embeddings, dim=1) - dump_torch_tensor_to_file(cat_input, "test_layer1_output.txt") - if debug: - print("concatenated inputs", cat_input.shape) - # reshape to add channel dimension - cat_input_reshape = cat_input.reshape(batch_size, num_channels, i_dim) - dump_torch_tensor_to_file(cat_input_reshape, "test_layer2_output.txt") - if debug: - print('reshaped input', cat_input_reshape.shape) - # transpose - transpose_cat = torch.transpose(cat_input_reshape, 2, 1) - dump_torch_tensor_to_file(transpose_cat, "test_layer3_output.txt") - if debug: - print('transposed', transpose_cat.shape) - if debuglv2: - print('transposed input\n', cat_input_reshape.data.cpu().numpy()) - print('transposed output\n', transpose_cat.data.cpu().numpy()) - # reshape 3 to 2 - batched_input_size = batch_size * i_dim - reshape_transpose_cat = torch.reshape(transpose_cat, (batched_input_size, num_channels)).float() - dump_torch_tensor_to_file(reshape_transpose_cat, "test_layer4_output.txt") - if debug: - print('reshaped', reshape_transpose_cat.shape) - # linear layer - projected_rtc = self.projected_rtc_layer(reshape_transpose_cat) - dump_torch_tensor_to_file(projected_rtc, "test_layer5_output.txt") - if debug: - print('projected batch:', projected_rtc.shape) - if debuglv2: - print('projected input:', cat_input_reshape.data.cpu().numpy()) - print('projected kernel:', self.projected_rtc_layer.weight.data.cpu().numpy()) - print('projected bias:', self.projected_rtc_layer.bias.data.cpu().numpy()) - print('projected output:', projected_rtc.data.cpu().numpy()) - # unpack inputs reshape 2 to 3 - unpacked_projected_rtc = torch.reshape(projected_rtc, (batch_size, i_dim, self.h_channel_dim)) - dump_torch_tensor_to_file(unpacked_projected_rtc, "test_layer6_output.txt") - if debug: - print('unpacked_projected_rtc', unpacked_projected_rtc.shape) - # bmm - batch_pairwise = torch.bmm(transpose_cat.transpose(-1, -2).float(), unpacked_projected_rtc.float()) - dump_torch_tensor_to_file(batch_pairwise, "test_layer7_output.txt") - if debug: - print('bmm input1', transpose_cat.shape, 'bmm input2', unpacked_projected_rtc.shape) - 
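One caveat about the comparison helpers near the top of this harness: `average_error_tolerance` computes `abs(sum(input1 - input2) / n)`, so positive and negative elementwise errors can cancel and hide real divergence. A stricter mean-absolute-error variant would look like the following sketch (assuming the same one-line, space-separated file format):

```
import numpy as np

def mean_abs_error_tolerance(file_1, file_2, label="", epsilon=1e-5):
    # Take the absolute value before averaging so that signed errors
    # cannot cancel each other out, unlike average_error_tolerance.
    a = np.loadtxt(file_1)
    b = np.loadtxt(file_2)
    mae = np.mean(np.abs(a - b))
    assert mae < epsilon, "%s: mean abs error %g exceeds %g" % (label, mae, epsilon)
```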
print('batch_pairwise', batch_pairwise.shape) - if debuglv2: - print('input1', transpose_cat.data.cpu().numpy()) - print('input2', unpacked_projected_rtc.data.cpu().numpy()) - print('output', batch_pairwise.data.cpu().numpy()) - flattened_pairwise = batch_pairwise.flatten(1, 2) - dump_torch_tensor_to_file(flattened_pairwise, "test_layer8_output.txt") - if debug: - print('flattened_pairwise', flattened_pairwise.shape) - tanh_flatteded_pairwise = torch.tanh(flattened_pairwise) - if debug: - print('tanh_flatteded_pairwise', tanh_flatteded_pairwise.shape) - if dense_projection is None: - return tanh_flatteded_pairwise - else: - cat_compression_ret = torch.cat([tanh_flatteded_pairwise, dense_projection], 1) - if debug: - print('dense_projection', dense_projection.shape) - return cat_compression_ret - -class LinearTest(unittest.TestCase): - TEST_TARGET = 'linear_test' - def _dump_meta(self, batch_size, i_dim, num_channels, dense_projection_o_dim, dense_projection_i_dim): - with open('test_meta.txt', 'w+') as f: - f.write(' '.join( - [str(batch_size), str(i_dim), - str(num_channels), str(dense_projection_o_dim), - str(dense_projection_i_dim) - ]) - ) - - - - def _run_gpu_test(self, num_gpu, batch_size, i_dim, \ - num_channels, dense_projection_o_dim, \ - dense_projection_i_dim, epsilon=0.001, \ - l=-1.0, r=1.0, epoch=1): - self._dump_meta(batch_size, i_dim, num_channels, dense_projection_o_dim, dense_projection_i_dim) - linear_weight = np.random.uniform(l,r, (dense_projection_o_dim, dense_projection_i_dim)) - dump_numpy_array_to_file(linear_weight, "test_kernel1.txt") - linear_bias = np.zeros(dense_projection_o_dim) - dump_numpy_array_to_file(linear_bias, "test_bias1.txt") - - dense_projection = np.random.uniform(l, r, (batch_size, dense_projection_i_dim)) - dense_projection = torch.from_numpy(dense_projection).float() - dump_numpy_array_to_file(dense_projection, "test_input1.txt") - pretrained_weights = [linear_weight, linear_bias] - - m = Linear(dense_projection_i_dim, dense_projection_o_dim, pretrained_weights, bias=True) - ret = m(dense_projection) - - output_gradiance = np.random.uniform(l,r,ret.data.cpu().numpy().shape) - dump_numpy_array_to_file(output_gradiance, "test_output_grad.txt") - output_gradiance_t = torch.from_numpy(output_gradiance).float() - # use output as grad just for testing - - optimizer = optim.SGD(m.parameters(), lr=0.01, momentum=0.0) - for _ in range(epoch): - optimizer.zero_grad() - ret.backward(output_gradiance_t, retain_graph=True) - optimizer.step() - linear_weights_updated = m.projected_rtc_layer.weight.data.numpy() - dump_numpy_array_to_file(linear_weights_updated, 'test_kernel_updated1.txt') - ret = m(dense_projection) - dump_numpy_array_to_file(ret.data.cpu().numpy(), "test_output.txt") - - # generate FF results - gen_FF_result(LinearTest.TEST_TARGET, num_gpu) - - - file1 = 'output.txt' - file2 = 'test_output.txt' - average_error_tolerance(file1, file2, 'output', epsilon=epsilon) - # average_error_tolerance(file1, file2, 'output', epsilon=epsilon) - file2 = 'kernel_updated1.txt' - file1 = 'test_kernel_updated1.txt' - average_error_tolerance(file1, file2, 'kernel', epsilon=epsilon) - - def test_single_gpu_simple_problem(self): - np.random.seed(0) - num_gpu = 1 - batch_size = 2 - i_dim = 0 # not used - num_channels = 4 - dense_projection_o_dim = 3 - dense_projection_i_dim = 2 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, dense_projection_o_dim, \ - dense_projection_i_dim,) - - def test_multi_gpu_small_small_mid_problem(self): - 
np.random.seed(0) - num_gpu = 2 - # batch_size % num_worker == 0 (reshape contraints) - batch_size = 10 - i_dim = 0 # not used - num_channels = 10 - dense_projection_o_dim = 1000 - dense_projection_i_dim = 2000 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, dense_projection_o_dim, \ - dense_projection_i_dim) - - - def test_multi_gpu_small_mid_problem(self): - np.random.seed(0) - num_gpu = 2 - # batch_size % num_worker == 0 (reshape contraints) - batch_size = 20 - i_dim = 0 # not used - num_channels = 20 - dense_projection_o_dim = 5000 - dense_projection_i_dim = 5000 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, dense_projection_o_dim, \ - dense_projection_i_dim) - - - - - -class ConcatTest(unittest.TestCase): - TEST_TARGET = 'concat_test' - def _dump_meta(self, batch_size, i_dim, num_channels, projected_num_channels, dense_projection_i_dim): - with open('test_meta.txt', 'w+') as f: - f.write(' '.join( - [str(batch_size), str(i_dim), - str(num_channels), str(projected_num_channels), - str(dense_projection_i_dim) - ]) - ) - - def _run_gpu_test(self, num_gpu, batch_size, i_dim, \ - num_channels, projected_num_channels, \ - dense_projection_i_dim, epsilon=0.001, \ - concat_last=True, l=-1.0, r=1.0, epoch=1): - self._dump_meta(batch_size, i_dim, num_channels, projected_num_channels, dense_projection_i_dim) - linear_weight = np.random.uniform(l,r, (projected_num_channels, num_channels)) - dump_numpy_array_to_file(linear_weight, "test_kernel1.txt") - linear_bias = np.zeros(projected_num_channels) - dump_numpy_array_to_file(linear_bias, "test_bias1.txt") - - dense_projection = np.random.uniform(l, r, (batch_size, dense_projection_i_dim)) - dense_projection = torch.from_numpy(dense_projection).float() - dump_numpy_array_to_file(dense_projection, "test_input1.txt") - - dense_embedding = np.random.uniform(l, r, (batch_size, i_dim)) - chunk_dense_embedded = [torch.from_numpy(dense_embedding) for _ in range(num_channels // 2)] - dump_numpy_array_to_file(dense_embedding, "test_input2.txt") - - sparse_embedding = np.random.uniform(l, r, (batch_size, i_dim)) - chunk_sparse_embedded = [torch.from_numpy(sparse_embedding) for _ in range(num_channels - num_channels // 2)] - dump_numpy_array_to_file(sparse_embedding, "test_input3.txt") - - - ret = np.concatenate(chunk_sparse_embedded+chunk_dense_embedded, axis=1) - dump_numpy_array_to_file(ret, "test_output.txt") - output_gradiance = np.random.uniform(l,r,ret.shape) - dump_numpy_array_to_file(output_gradiance, "test_output_grad.txt") - output_gradiance_t = torch.from_numpy(output_gradiance).float() - - # generate FF results - gen_FF_result(ConcatTest.TEST_TARGET, num_gpu) - - - file1 = 'output.txt' - file2 = 'test_output.txt' - is_equal_tensor_from_file(file1, file2, 'output', epsilon=epsilon) - - def test_single_gpu_simple_problem(self): - np.random.seed(0) - num_gpu = 1 - batch_size = 2 - i_dim = 6 - num_channels = 4 - projected_num_channels = 3 - dense_projection_i_dim = 2 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, projected_num_channels, \ - dense_projection_i_dim, \ - concat_last=True) - - def test_multi_gpu_small_working_problem(self): - np.random.seed(0) - num_gpu = 2 - # batch_size % num_worker == 0 (reshape contraints) - batch_size = 2 - i_dim = 6 - num_channels = 2 - projected_num_channels = 2 - dense_projection_i_dim = 3 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, projected_num_channels, \ - dense_projection_i_dim, \ - concat_last=True, epoch=10) - - def 
test_multi_gpu_small_problem(self): - np.random.seed(0) - num_gpu = 2 - # batch_size % num_worker == 0 (reshape contraints) - batch_size = 4 - i_dim = 6 - num_channels = 2 - projected_num_channels = 2 - dense_projection_i_dim = 3 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, projected_num_channels, \ - dense_projection_i_dim, \ - concat_last=True, epoch=10) - - - def test_multi_gpu_small_small_mid_problem(self): - np.random.seed(0) - num_gpu = 2 - # batch_size % num_worker == 0 (reshape contraints) - batch_size = 4 - i_dim = 10 - num_channels = 10 - projected_num_channels = 6 - dense_projection_i_dim = 12 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, projected_num_channels, \ - dense_projection_i_dim, \ - concat_last=True, epoch=10) - - def test_single_gpu_small_small_mid_problem(self): - np.random.seed(0) - num_gpu = 1 - # batch_size % num_worker == 0 (reshape contraints) - batch_size = 4 - i_dim = 10 - num_channels = 10 - projected_num_channels = 6 - dense_projection_i_dim = 12 - self._run_gpu_test(num_gpu, batch_size, i_dim, \ - num_channels, projected_num_channels, \ - dense_projection_i_dim, \ - concat_last=True, epoch=10) - -class BatchMatmulTest(unittest.TestCase): - ''' - BMM default layout - input1 (d,m,k) - input2 (d,k,n) - output (d,m,n) - - Target shape in caffe2 - input1 (d,k,m) - input2 (d,k,n) - output (d,m,n) - so we set trans1=True and trans2=False - ''' - TEST_TARGET = 'batch_matmul_test' - def _dump_meta(self, m,k,n,d): - with open('test_meta.txt', 'w+') as f: - f.write(' '.join([str(m), str(k), str(n), str(d)])) - - def _run_gpu_test(self, num_gpu, d, m, n, k, epsilon=0.00001): - # generate python reference and input payload - input1_tensor = np.random.uniform(0, 1, (d,k,m)) - dump_numpy_array_to_file(input1_tensor, "test_input1.txt") - input2_tensor = np.random.uniform(0, 1, (d,k,n)) - dump_numpy_array_to_file(input2_tensor, "test_input2.txt") - output_gradient_tensor = np.random.uniform(0, 1, (d,m,n)) - dump_numpy_array_to_file(output_gradient_tensor, "test_output_grad.txt") - - output_tensor = batch_matmul_3d_reference(input1_tensor, input2_tensor, trans1=True, trans2=False) - input1_grad_tensor = batch_matmul_3d_reference(input2_tensor, output_gradient_tensor, trans1=False, trans2=True) - input2_grad_tensor = batch_matmul_3d_reference(input1_tensor, output_gradient_tensor, trans1=False, trans2=False) - dump_numpy_array_to_file(output_tensor, "test_output.txt") - dump_numpy_array_to_file(input1_grad_tensor, "test_input1_grad.txt") - dump_numpy_array_to_file(input2_grad_tensor, "test_input2_grad.txt") - self._dump_meta(m,k,n,d) - - # generate FF results - gen_FF_result(BatchMatmulTest.TEST_TARGET, num_gpu) - file1 = 'output.txt' - file2 = 'test_output.txt' - ret1 = is_equal_tensor_from_file(file1, file2, 'output', epsilon=epsilon) - file1 = 'test_input1_grad.txt' - file2 = 'input1_grad.txt' - ret2 = is_equal_tensor_from_file(file1, file2, 'input1_grad', epsilon=epsilon) - file1 = 'test_input2_grad.txt' - file2 = 'input2_grad.txt' - ret3 = is_equal_tensor_from_file(file1, file2, 'input2_grad', epsilon=epsilon) - - def test_single_gpu_single_batch(self): - # generate test payload - d,m,n,k = 1,2,3,4 - num_gpu = 1 - self._run_gpu_test(num_gpu, d, m, n, k) - - def test_single_gpu_multi_batches(self): - # generate test payload - d,m,n,k = 5,2,3,4 - num_gpu = 1 - self._run_gpu_test(num_gpu, d, m, n, k) - - def test_multi_gpus_multi_batches(self): - # generate test payload - d,m,n,k = 5,2,3,4 - num_gpu = 2 - 
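The gradient references in `BatchMatmulTest._run_gpu_test` above follow from the forward layout Y = A^T B, with A of shape (d,k,m) and B of shape (d,k,n): dL/dA = B (dL/dY)^T and dL/dB = A (dL/dY). A quick numpy check of those identities (values are illustrative):

```
import numpy as np

d, m, n, k = 5, 2, 3, 4
A = np.random.rand(d, k, m)   # input1, stored transposed (hence trans1=True)
B = np.random.rand(d, k, n)   # input2
dY = np.random.rand(d, m, n)  # upstream output gradient

Y = np.matmul(A.transpose(0, 2, 1), B)    # forward: (d,m,n)
dA = np.matmul(B, dY.transpose(0, 2, 1))  # (d,k,n) x (d,n,m) -> (d,k,m)
dB = np.matmul(A, dY)                     # (d,k,m) x (d,m,n) -> (d,k,n)
assert Y.shape == (d, m, n)
assert dA.shape == A.shape and dB.shape == B.shape
```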
self._run_gpu_test(num_gpu, d, m, n, k)
-
-    # def uneven_distribute_test(self):
-    #     # for this configuration we can't distribute the payload to each GPU because
-    #     # ceil(9 / 8) = 2: we assign 2 batches per gpu, so we only assign payloads to 5 gpus and 3 gpus won't get
-    #     # any payload; in this scenario FF throws a `acc.accessor.is_dense_arbitrary(rect)' failed error
-    #     # this error is too deep for the user to debug; we need to handle this case in FF
-    #     # and throw a proper exception, so this test should expect an exception
-    #     d,m,n,k = 9,2,3,4
-    #     num_gpu = 8
-    #     self._run_gpu_test(num_gpu, d, m, n, k)
-
-    def test_unit_size_matrix(self):
-        # generate test payload
-        d,m,n,k = 1,1,1,1
-        num_gpu = 1
-        self._run_gpu_test(num_gpu, d, m, n, k)
-
-    # renamed from a second `test_unit_size_matrix` definition, which
-    # silently shadowed the single-gpu variant above
-    def test_unit_size_matrix_multi_gpu(self):
-        # generate test payload
-        d,m,n,k = 2,1,1,1
-        num_gpu = 2
-        self._run_gpu_test(num_gpu, d, m, n, k)
-
-    def test_small_size_matrix2(self):
-        # generate test payload
-        d,m,n,k = 2,2,2,1
-        num_gpu = 2
-        self._run_gpu_test(num_gpu, d, m, n, k)
-
-    def test_multi_gpus_ads_team_target_model_shape(self):
-        # generate test payload
-        d,m,n,k = 145,265,15,64
-        num_gpu = 2
-        self._run_gpu_test(num_gpu, d, m, n, k, epsilon=0.0001)
-
-    def test_single_gpu_ads_team_target_model_shape(self):
-        # generate test payload
-        d,m,n,k = 145,265,15,64
-        num_gpu = 1
-        self._run_gpu_test(num_gpu, d, m, n, k, epsilon=0.0001)
-
-class TransposeTest(unittest.TestCase):
-    '''
-    Transpose shape (d,m,k)
-    '''
-    TEST_TARGET = 'transpose_test'
-    def _dump_meta(self,m,k,d):
-        with open('test_meta.txt', 'w+') as f:
-            f.write(' '.join([str(m), str(k), str(d)]))
-
-    def test_single_gpu_single_batch(self):
-        # generate test payload
-        d,m,k = 1,2,3
-        num_gpu = 1
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    def test_single_gpu_multi_batches(self):
-        d,m,k = 9,2,3
-        num_gpu = 1
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    def test_unit_batch_matrix(self):
-        d,m,k = 1,1,1
-        num_gpu = 1
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    def test_multi_gpus_ads_team_target_shape(self):
-        d,m,k = 145, 265, 64
-        num_gpu = 2
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    def test_single_gpu_ads_team_target_shape(self):
-        d,m,k = 145, 265, 64
-        num_gpu = 1
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    def test_multi_gpus_small_problem(self):
-        d,m,k = 2,3,4
-        num_gpu = 2
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    def uneven_split_multi_gpus_multi_batch(self):
-        d,m,k = 3,4,5
-        num_gpu = 2
-        self._run_gpu_test(num_gpu, d, m, k)
-
-    # # if number_gpu * number_node > batch_size, FF will throw an exception
-    # # need to handle this exception in FF and add this unit test later on (to expect an exception)
-    # def test_multi_gpus_single_batch(self):
-    #     d,m,k = 1,2,3
-    #     num_gpu = 2
-    #     ret = self.transpose_test_pipeline(num_gpu, d, m, k)
-
-    def _run_gpu_test(self, num_gpu, d, m, k, epsilon=0.00001):
-        # generate python reference and input payload
-        input_tensor = np.random.uniform(0, 1, (d,m,k))
-        dump_numpy_array_to_file(input_tensor, "test_input1.txt")
-        output_gradient_tensor = np.random.uniform(0, 1, (d,k,m))
-        dump_numpy_array_to_file(output_gradient_tensor, "test_output_grad.txt")
-        output_tensor = batch_transpose_3d_reference(input_tensor)
-        input_grad_tensor = batch_transpose_3d_reference(output_gradient_tensor)
-        dump_numpy_array_to_file(output_tensor, "test_output.txt")
-        dump_numpy_array_to_file(input_grad_tensor, "test_input1_grad.txt")
-        self._dump_meta(m,k,d)
-
-        # generate FF results
-        gen_FF_result(TransposeTest.TEST_TARGET, num_gpu)
-        file1 = 'output.txt'
-        file2 =
'test_output.txt' - is_equal_tensor_from_file(file1, file2, 'output', epsilon=epsilon) - file1 = 'test_input1_grad.txt' - file2 = 'input1_grad.txt' - is_equal_tensor_from_file(file1, file2, 'input_grad', epsilon=epsilon) - -class ReshapeTest(unittest.TestCase): - ''' - 4 dimensional to 2dimensional flatten - ''' - TEST_TARGET = 'reshape_test' - def _dump_meta(self,i_dim, o_dim, i_shape, o_shape): - i_shape = [str(x) for x in i_shape] - o_shape = [str(x) for x in o_shape] - with open('test_meta.txt', 'w+') as f: - f.write(' '.join([str(i_dim), str(o_dim)]+i_shape+o_shape)) - - - def _run_gpu_test(self, num_gpu, i_dim, o_dim, i_shape, o_shape, epsilon=0.00001): - # generate python reference and input payload - input_tensor = np.random.uniform(0, 1, i_shape) - dump_numpy_array_to_file(input_tensor, "test_input1.txt") - output_gradient_tensor = np.random.uniform(0, 1, o_shape) - dump_numpy_array_to_file(output_gradient_tensor, "test_output_grad.txt") - output_tensor = input_tensor.reshape(o_shape) - input_grad_tensor = output_gradient_tensor.reshape(i_shape) - dump_numpy_array_to_file(output_tensor, "test_output.txt") - dump_numpy_array_to_file(input_grad_tensor, "test_input1_grad.txt") - self._dump_meta(i_dim, o_dim, i_shape, o_shape) - - # generate FF results - gen_FF_result(ReshapeTest.TEST_TARGET, num_gpu) - file1 = 'output.txt' - file2 = 'test_output.txt' - is_equal_tensor_from_file(file1, file2, 'output', epsilon=epsilon) - file1 = 'test_input1_grad.txt' - file2 = 'input1_grad.txt' - is_equal_tensor_from_file(file1, file2, 'input_grad', epsilon=epsilon) - - - def test_single_gpu_multi_batch_32(self): - num_gpu = 1 - i_dim = 3 - o_dim = 2 - i_shape = (3,3,15) - o_shape = (9,15) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - def test_multi_gpu_multi_batch_32(self): - num_gpu = 2 - i_dim = 3 - o_dim = 2 - i_shape = (2,3,4) - o_shape = (6,4) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - def test_problem_size_32(self): - num_gpu = 2 - i_dim = 3 - o_dim = 2 - i_shape = (144,64,265) - o_shape = (144*64,265) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - - def test_single_gpu_multi_batch_23(self): - num_gpu = 1 - i_dim = 2 - o_dim = 3 - i_shape = (9,15) - o_shape = (3,3,15) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - def test_multi_gpu_multi_batch_23(self): - num_gpu = 2 - i_dim = 2 - o_dim = 3 - i_shape = (6,2) - o_shape = (2,3,2) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - def test_problem_size_23(self): - num_gpu = 2 - i_dim = 2 - o_dim = 3 - i_shape = (144*64,265) - o_shape = (144,64,265) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - def test_flatten_inner_2(self): - num_gpu = 2 - i_dim = 3 - o_dim = 2 - i_shape = (2,3,2) - o_shape = (2,6) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - - def test_flatten_inner_2_target_size(self): - num_gpu = 2 - i_dim = 3 - o_dim = 2 - i_shape = (144,265,15) - o_shape = (144,3975) - self._run_gpu_test(num_gpu, i_dim, o_dim, i_shape, o_shape) - -class TanhTest(unittest.TestCase): - ''' - 4 dimensional to 2dimensional flatten - ''' - TEST_TARGET = 'tanh_test' - def _dump_meta(self,i_dim, o_dim, i_shape, o_shape): - i_shape = [str(x) for x in i_shape] - o_shape = [str(x) for x in o_shape] - with open('test_meta.txt', 'w+') as f: - f.write(' '.join([str(i_dim), str(o_dim)]+i_shape+o_shape)) - - - def _run_gpu_test(self, num_gpu, i_dim, o_dim, i_shape, epsilon=0.00001): - # generate python reference and input payload - 
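For the remaining layout and pointwise operators the backward references are equally direct: a transpose's input gradient is the same transpose applied to the output gradient, a reshape's input gradient is the output gradient reshaped back to the input shape, and tanh's is the output gradient scaled by `1 - tanh(x)**2` (the backward reference that `TanhTest` below still marks as a TODO). A short numpy sketch:

```
import numpy as np

x = np.random.rand(2, 3, 5)

# transpose of the inner two dims: the upstream grad has the output
# shape (2, 5, 3); the input grad is the same transpose applied to it
g_t = np.random.rand(2, 5, 3)
gx_transpose = g_t.transpose(0, 2, 1)      # back to (2, 3, 5)

# reshape: the upstream grad has the output shape; reshape it back
g_r = np.random.rand(6, 5)
gx_reshape = g_r.reshape(x.shape)

# tanh is elementwise: the input grad is g * (1 - tanh(x)^2); this is
# the reference that the deleted TanhTest leaves as a TODO
g_e = np.random.rand(2, 3, 5)
gx_tanh = g_e * (1.0 - np.tanh(x) ** 2)
```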
input_tensor_value = np.random.uniform(0, 1, i_shape) - input_tensor = torch.from_numpy(input_tensor_value).float() - dump_numpy_array_to_file(input_tensor_value, "test_input1.txt") - output_gradient_tensor = np.random.uniform(0, 1, i_shape) - dump_numpy_array_to_file(output_gradient_tensor, "test_output_grad.txt") - output_tensor = torch.nn.Tanh()(input_tensor).numpy() - dump_numpy_array_to_file(output_tensor, "test_output.txt") - self._dump_meta(i_dim, o_dim, i_shape, i_shape) - - # generate FF results - gen_FF_result(TanhTest.TEST_TARGET, num_gpu) - file1 = 'output.txt' - file2 = 'test_output.txt' - is_equal_tensor_from_file(file1, file2, 'output', epsilon=epsilon) - - # todo: add backward checking - # todo: tanh backward - # input_grad_tensor = output_gradient_tensor.reshape(i_shape) - # dump_numpy_array_to_file(input_grad_tensor, "test_input1_grad.txt") - # file1 = 'test_input1_grad.txt' - # file2 = 'input1_grad.txt' - # is_equal_tensor_from_file(file1, file2, 'input_grad', epsilon=epsilon) - - - def test_single_gpu_multi_batch_3d(self): - num_gpu = 1 - i_dim = 3 - i_shape = (2,3,5) - self._run_gpu_test(num_gpu, i_dim, i_dim, i_shape) - - def test_single_gpu_multi_batch_2d(self): - num_gpu = 1 - i_dim = 2 - i_shape = (2,3) - self._run_gpu_test(num_gpu, i_dim, i_dim, i_shape) - - def test_multi_gpu_multi_batch_2d(self): - num_gpu = 2 - i_dim = 2 - i_shape = (2,3) - self._run_gpu_test(num_gpu, i_dim, i_dim, i_shape) - - def test_multi_gpu_target_size(self): - num_gpu = 2 - i_dim = 2 - i_shape = (145,3975) - self._run_gpu_test(num_gpu, i_dim, i_dim, i_shape) - - def test_multi_gpu_multi_batch_3d(self): - num_gpu = 2 - i_dim = 3 - i_shape = (2,2,2) - self._run_gpu_test(num_gpu, i_dim, i_dim, i_shape) - -if __name__ == '__main__': - unittest.main() diff --git a/tests/ops/test_readme.md b/tests/ops/test_readme.md deleted file mode 100644 index 2901dbcaab..0000000000 --- a/tests/ops/test_readme.md +++ /dev/null @@ -1,8 +0,0 @@ - -# Flexflow operator unit test -1. To build test targets: - - BatchMatmul: `cd ~/DLRM_FlexFlow && make app=src/ops/tests/batch_matmul_test -j 20 -f Makefile` - - Transpose: `cd ~/DLRM_FlexFlow && make app=src/ops/tests/transpose -j 20 -f Makefile` -2. To run unit test - - `cd ~/DLRM_FlexFlow/src/ops/tests/ && ./test_bootstrap.sh` # run bootstrap to request 8 gpus resource - - `cd ~/DLRM_FlexFlow/src/ops/tests/ && ./test_run.sh` # run unit tests \ No newline at end of file diff --git a/tests/ops/test_run.sh b/tests/ops/test_run.sh deleted file mode 100755 index 2f45c6770c..0000000000 --- a/tests/ops/test_run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# this is to silience MPI warning http://www.open-mpi.org/faq/?category=openfabrics#ofa-default-subnet-gid -DIR=~/.openmpi -FILE=~/.openmpi/mca-params.conf -if [ ! -d "$DIR" ]; then - mkdir ~/.openmpi -fi -if [ ! 
-f "$FILE" ]; then - touch ~/.openmpi/mca-params.conf - echo "btl_openib_warn_default_gid_prefix=0" >> ~/.openmpi/mca-params.conf -fi - -cd ~/DLRM_FlexFlow/src/ops/tests/ && python -m unittest test_harness.TransposeTest -cd ~/DLRM_FlexFlow/src/ops/tests/ && python -m unittest test_harness.BatchMatmulTest -cd ~/DLRM_FlexFlow/src/ops/tests/ && python -m unittest test_harness.ReshapeTest -cd ~/DLRM_FlexFlow/src/ops/tests/ && python -m unittest test_harness.TanhTest -cd ~/DLRM_FlexFlow/src/ops/tests/ && python -m unittest test_harness.DotCompressorTest - diff --git a/tests/ops/test_run_FF_target.sh b/tests/ops/test_run_FF_target.sh deleted file mode 100755 index 94c4426838..0000000000 --- a/tests/ops/test_run_FF_target.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -test_target="$1" -numgpu="$2" -eval "./${test_target}" -ll:gpu "${numgpu}" -ll:cpu 4 -ll:util "${numgpu}" -dm:memorize --strategy "../../runtime/dlrm_strategy_emb_1_gpu_${numgpu}_node_1.pb" diff --git a/tests/ops/test_utils.cc b/tests/ops/test_utils.cc deleted file mode 100644 index 17eb335b5a..0000000000 --- a/tests/ops/test_utils.cc +++ /dev/null @@ -1,253 +0,0 @@ -#include "test_utils.h" - -#define PRECISION 6 -#define MAX_DATASET_PATH_LEN 1023 - -struct ArgsConfig { - char dataset_path[MAX_DATASET_PATH_LEN]; - char data_type[30]; - int num_dim; -}; - -void initialize_tensor_from_file(const std::string file_path, - Tensor label, - FFModel const &ff, - std::string data_type, - int num_dim); - -void initialize_tensor_gradient_from_file(const std::string file_path, - Tensor label, - FFModel const &ff, - std::string data_type, - int num_dim) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - ArgsConfig args_config; - strcpy(args_config.dataset_path, file_path.c_str()); - strcpy(args_config.data_type, data_type.c_str()); - if (num_dim == 1) { - TaskLauncher launcher(INIT_TENSOR_1D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - // regions[0]: full_sparse_input - launcher.add_region_requirement(RegionRequirement(label.region_grad, - WRITE_ONLY, - EXCLUSIVE, - label.region_grad, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (num_dim == 2) { - TaskLauncher launcher(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - // regions[0]: full_sparse_input - launcher.add_region_requirement(RegionRequirement(label.region_grad, - WRITE_ONLY, - EXCLUSIVE, - label.region_grad, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (num_dim == 3) { - TaskLauncher launcher(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - // regions[0]: full_sparse_input - launcher.add_region_requirement(RegionRequirement(label.region_grad, - WRITE_ONLY, - EXCLUSIVE, - label.region_grad, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (num_dim == 4) { - TaskLauncher launcher(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - // regions[0]: full_sparse_input - launcher.add_region_requirement(RegionRequirement(label.region_grad, - WRITE_ONLY, - EXCLUSIVE, - label.region_grad, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else { - throw 255; - } -} - -void initialize_tensor_from_file(const std::string file_path, - Tensor label, - FFModel const &ff, - std::string data_type, - int 
num_dim) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - ArgsConfig args_config; - strcpy(args_config.dataset_path, file_path.c_str()); - strcpy(args_config.data_type, data_type.c_str()); - if (num_dim == 1) { - TaskLauncher launcher(INIT_TENSOR_1D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - launcher.add_region_requirement(RegionRequirement( - label.region, WRITE_ONLY, EXCLUSIVE, label.region, MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (num_dim == 2) { - TaskLauncher launcher(INIT_TENSOR_2D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - launcher.add_region_requirement(RegionRequirement( - label.region, WRITE_ONLY, EXCLUSIVE, label.region, MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (num_dim == 3) { - TaskLauncher launcher(INIT_TENSOR_3D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - launcher.add_region_requirement(RegionRequirement( - label.region, WRITE_ONLY, EXCLUSIVE, label.region, MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (num_dim == 4) { - TaskLauncher launcher(INIT_TENSOR_4D_FROM_FILE_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - launcher.add_region_requirement(RegionRequirement( - label.region, WRITE_ONLY, EXCLUSIVE, label.region, MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else { - throw 255; - } -} - -template -void initialize_tensor_from_file_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - const ArgsConfig args_config = *((ArgsConfig const *)task->args); - std::string file_path((char const *)args_config.dataset_path); - std::string data_type((char const *)args_config.data_type); - Rect rect_label_tensor = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - if (data_type == "int") { - AccessorWO const acc_label_tensor(regions[0], FID_DATA); - int *tensor_ptr = acc_label_tensor.ptr(rect_label_tensor.lo); - std::fstream myfile(file_path, std::ios_base::in); - int a; - int i = 0; - while (myfile >> a) { - tensor_ptr[i] = a; - i++; - } - myfile.close(); - } else if (data_type == "float") { - AccessorWO const acc_label_tensor(regions[0], FID_DATA); - float *tensor_ptr = acc_label_tensor.ptr(rect_label_tensor.lo); - std::fstream myfile(file_path, std::ios_base::in); - float a; - int i = 0; - while (myfile >> a) { - tensor_ptr[i] = a; - i++; - } - myfile.close(); - } -} - -void dump_region_to_file(FFModel &ff, - LogicalRegion ®ion, - std::string file_path, - int dims) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - ArgsConfig args_config; - strcpy(args_config.dataset_path, file_path.c_str()); - if (dims == 2) { - TaskLauncher launcher(DUMP_TENSOR_2D_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - launcher.add_region_requirement(RegionRequirement( - region, READ_WRITE, EXCLUSIVE, region, MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } else if (dims == 3) { - TaskLauncher launcher(DUMP_TENSOR_3D_CPU_TASK, - TaskArgument(&args_config, sizeof(args_config))); - launcher.add_region_requirement(RegionRequirement( - region, READ_WRITE, EXCLUSIVE, region, MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - - } 
else if (dims == 4) {
-    TaskLauncher launcher(DUMP_TENSOR_4D_CPU_TASK,
-                          TaskArgument(&args_config, sizeof(args_config)));
-    launcher.add_region_requirement(RegionRequirement(
-        region, READ_WRITE, EXCLUSIVE, region, MAP_TO_ZC_MEMORY));
-    launcher.add_field(0, FID_DATA);
-    runtime->execute_task(ctx, launcher);
-
-  } else {
-    std::cout << "dims: " << dims << std::endl;
-    // not supported
-    throw 255;
-  }
-}
-
-template <int DIM>
-void dump_tensor_task(Task const *task,
-                      std::vector<PhysicalRegion> const &regions,
-                      Context ctx,
-                      Runtime *runtime) {
-  assert(task->regions.size() == 1);
-  assert(regions.size() == 1);
-  const ArgsConfig args_config = *((ArgsConfig const *)task->args);
-  std::string file_path((char const *)args_config.dataset_path);
-  AccessorRO<float, DIM> const acc_tensor(regions[0], FID_DATA);
-  Rect<DIM> rect_fb = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  assert(acc_tensor.accessor.is_dense_arbitrary(rect_fb));
-  float const *tensor_ptr = acc_tensor.ptr(rect_fb.lo);
-  std::ofstream myfile;
-  myfile.open(file_path);
-  for (size_t i = 0; i < rect_fb.volume(); ++i) {
-    // printf("%.6lf ", (float)tensor_ptr[i]);
-    myfile << std::fixed << std::setprecision(PRECISION) << (float)tensor_ptr[i]
-           << " ";
-  }
-  myfile.close();
-}
-
-template void dump_tensor_task<1>(Task const *task,
-                                  std::vector<PhysicalRegion> const &regions,
-                                  Context ctx,
-                                  Runtime *runtime);
-template void dump_tensor_task<2>(Task const *task,
-                                  std::vector<PhysicalRegion> const &regions,
-                                  Context ctx,
-                                  Runtime *runtime);
-template void dump_tensor_task<3>(Task const *task,
-                                  std::vector<PhysicalRegion> const &regions,
-                                  Context ctx,
-                                  Runtime *runtime);
-template void dump_tensor_task<4>(Task const *task,
-                                  std::vector<PhysicalRegion> const &regions,
-                                  Context ctx,
-                                  Runtime *runtime);
-template void initialize_tensor_from_file_task<1>(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime);
-template void initialize_tensor_from_file_task<2>(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime);
-template void initialize_tensor_from_file_task<3>(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime);
-template void initialize_tensor_from_file_task<4>(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime);
diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc
deleted file mode 100644
index 10481aa14f..0000000000
--- a/tests/ops/transpose_test.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-#include "model.h"
-#include "test_utils.h"
-#include
-#include
-#include
-#include
-using namespace Legion;
-LegionRuntime::Logger::Category log_app("transpose_test");
-
-struct TransposeTestMeta {
-  int m, k, d;
-  TransposeTestMeta(int _m, int _k, int _d) {
-    m = _m, k = _k, d = _d;
-  }
-};
-
-TransposeTestMeta get_test_meta(const std::string file_path) {
-  std::fstream myfile(file_path, std::ios_base::in);
-  int m, k, d;
-  myfile >> m >> k >> d;
-  return TransposeTestMeta(m, k, d);
-}
-
-void top_level_task(Task const *task,
-                    std::vector<PhysicalRegion> const &regions,
-                    Context ctx,
-                    Runtime *runtime) {
-  // std::cout << "test framework launched" << std::endl;
-  auto test_meta = get_test_meta("test_meta.txt");
-  FFConfig ffConfig;
-  // create ff model object
-  FFModel ff(ffConfig);
-  // create input tensor
-  Tensor dense_input;
-  {
-    int const dims[3] = {
-        test_meta.d, test_meta.m, test_meta.k}; // target shape (d,m,k)
-    dense_input = ff.create_tensor<3>(dims, "transpose", DT_FLOAT);
-  }
-  // build transpose layer
-  Tensor ret = ff.transpose("transpose", dense_input);
-  // load input tensors and output gradient tensors for testing
-  auto input1_file_path = "test_input1.txt";
-  auto output_grad_file_path = "test_output_grad.txt";
-  initialize_tensor_from_file(input1_file_path, dense_input, ff, "float", 3);
-  initialize_tensor_gradient_from_file(
-      output_grad_file_path, ret, ff, "float", 3);
-  // run forward and backward to produce results
-  ff.init_layers();
-  ff.forward();
-  ff.backward();
-  // dump results to file for python validation
-  dump_region_to_file(ff, ret.region, "output.txt", 3);
-  dump_region_to_file(ff, dense_input.region_grad, "input1_grad.txt", 3);
-}
-
-void register_custom_tasks() {
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_2D_FROM_FILE_CPU_TASK,
-                                   "Load 2d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant<initialize_tensor_from_file_task<2>>(
-        registrar, "Load 2d tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_3D_FROM_FILE_CPU_TASK,
-                                   "Load 3d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant<initialize_tensor_from_file_task<3>>(
-        registrar, "Load 3d tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(INIT_TENSOR_4D_FROM_FILE_CPU_TASK,
-                                   "Load 4d Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant<initialize_tensor_from_file_task<4>>(
-        registrar, "Load 4d tensor Task");
-  }
-
-  {
-    TaskVariantRegistrar registrar(DUMP_TENSOR_2D_CPU_TASK, "Compare Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant<dump_tensor_task<2>>(
-        registrar, "Compare Tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(DUMP_TENSOR_4D_CPU_TASK, "Compare Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant<dump_tensor_task<4>>(
-        registrar, "Compare Tensor Task");
-  }
-  {
-    TaskVariantRegistrar registrar(DUMP_TENSOR_3D_CPU_TASK, "Compare Tensor");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    Runtime::preregister_task_variant<dump_tensor_task<3>>(
-        registrar, "Compare Tensor Task");
-  }
-}
diff --git a/tests/ops/transpose_test.cu b/tests/ops/transpose_test.cu
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/python_interface_test.sh b/tests/python_interface_test.sh
deleted file mode 100755
index 1721159a6b..0000000000
--- a/tests/python_interface_test.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#! 
/usr/bin/env bash -set -x -set -e - -check_python_interface() { - # Usage: check_python_interface {python, flexflow_python} - GPUS=1 - BATCHSIZE=$((GPUS * 64)) - FSIZE=14048 - ZSIZE=12192 - interpreter=${1:-python} - if [[ "$interpreter" == "python" ]]; then - export FF_USE_NATIVE_PYTHON=1 - EXE="python" - echo "Running a single-GPU Python test to check the Python interface (native python interpreter)" - $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - unset FF_USE_NATIVE_PYTHON - elif [[ "$interpreter" == "flexflow_python" ]]; then - EXE="$FF_HOME"/python/flexflow_python - echo "Running a single-GPU Python test to check the Python interface (flexflow_python interpreter)" - $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - else - echo "Invalid Python interpreter" - exit 1 - fi -} - - -if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi - -installation_status=${1:-"before-installation"} -echo "Running Python interface tests (installation status: ${installation_status})" -if [[ "$installation_status" == "before-installation" ]]; then - # Import flexflow.core module in Python - export PYTHONPATH="${FF_HOME}/python" - python -c "import flexflow.core; exit()" - unset PYTHONPATH - # Run a single-gpu test using the flexflow_python interpreter - check_python_interface flexflow_python - # Run a single-gpu test using the native python interpreter - export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/python" - check_python_interface python - unset PYTHONPATH -elif [[ "$installation_status" == "after-installation" ]]; then - # Import flexflow.core module in Python - python -c "import flexflow.core; exit()" - # Run a single-gpu test using the flexflow_python interpreter - check_python_interface flexflow_python - # Run a single-gpu test using the native python interpreter - check_python_interface python -else - echo "Invalid installation status!" - echo "Usage: $0 {before-installation, after-installation}" - exit 1 -fi diff --git a/triton/CMakeLists.txt b/triton/CMakeLists.txt deleted file mode 100644 index 1377bf64e7..0000000000 --- a/triton/CMakeLists.txt +++ /dev/null @@ -1,186 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#------------------------------------------------------------------------------#
-
-cmake_minimum_required(VERSION 3.18)
-
-project(tritonlegionbackend LANGUAGES C CXX)
-
-include(GNUInstallDirs)
-
-#
-# Options
-#
-# To build the Legion backend you must:
-#
-# - Point to the "legion/runtime" directory in a Legion repo using
-#   TRITON_LEGION_LG_RT_DIR
-#
-option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
-option(TRITON_LEGION_BACKEND_BUILD_TEST "Build the unit tests for the backend" ON)
-set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
-set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
-set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
-
-set(GASNET_CONDUIT "mpi" CACHE STRING "Conduit used in Gasnet")
-option(LEGION_USE_NETWORK "Include support for multi-node execution" ON)
-
-if (WIN32)
-  message(FATAL_ERROR
-    "Legion backend is currently not supported for Windows")
-endif() # WIN32
-
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Release)
-endif()
-
-#
-# Gasnet
-#
-include(ExternalProject)
-if(${LEGION_USE_NETWORK})
-  ExternalProject_Add(gasnet
-    PREFIX gasnet
-    GIT_REPOSITORY "https://github.com/StanfordLegion/gasnet.git"
-    GIT_TAG "master"
-    SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/gasnet/src/gasnet"
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND $(MAKE) CONDUIT=${GASNET_CONDUIT}
-    BUILD_IN_SOURCE 1
-    INSTALL_COMMAND ""
-    TEST_COMMAND ""
-  )
-  set(GASNET_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gasnet/src/gasnet/release)
-  # Setting variables for Legion build
-  set(LEGION_NETWORKS gasnetex)
-  set(LEGION_DEPENDS gasnet)
-endif()
-
-#
-# Legion
-#
-include(ExternalProject)
-ExternalProject_Add(legion
-  PREFIX legion
-  GIT_REPOSITORY "https://gitlab.com/StanfordLegion/legion.git"
-  GIT_TAG "control_replication"
-  SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/legion/src/legion"
-  CMAKE_CACHE_ARGS
-    ${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE}
-    ${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET}
-    -DCMAKE_BUILD_TYPE:STRING=RELEASE
-    -DBUILD_SHARED_LIBS:BOOL=OFF
-    -DLegion_MAX_DIM:STRING=4
-    -DLegion_MAX_FIELDS:STRING=64
-    -DLegion_OUTPUT_LEVEL:STRING=INFO
-    -DLegion_USE_CUDA:BOOL=${TRITON_ENABLE_GPU}
-    -DLegion_USE_ZLIB:BOOL=ON
-    -DLegion_USE_LIBDL:BOOL=ON
-    -DBUILD_MARCH:STRING=native
-    -DLegion_NETWORKS:STRING=${LEGION_NETWORKS}
-    -DGASNet_CONDUIT:STRING=${GASNET_CONDUIT}
-    -DGASNet_ROOT_DIR:PATH=${GASNET_ROOT}
-    -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}/legion
-  DEPENDS ${LEGION_DEPENDS}
-)
-# FIXME what CUDA arch to use? 
-# -DLegion_CUDA_ARCH=70 -set(_FINDPACKAGE_LEGION_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}/legion/share/Legion/cmake") - -# -# Protobuf -# -ExternalProject_Add(protobuf-repo - PREFIX protobuf-repo - GIT_REPOSITORY "https://github.com/protocolbuffers/protobuf.git" - GIT_TAG "v3.17.1" - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf-repo/src/protobuf" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -# -# Build protobuf project from protobuf-repo -# -# Use -DCMAKE_BUILD_TYPE:STRING=RELEASE to workaround the protobuf build issue -# described in https://stackoverflow.com/a/60218066/2091555 -ExternalProject_Add(protobuf - PREFIX protobuf - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf-repo/src/protobuf/cmake" - DOWNLOAD_COMMAND "" - CMAKE_CACHE_ARGS - ${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE} - ${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -Dprotobuf_BUILD_TESTS:BOOL=OFF - -Dprotobuf_WITH_ZLIB:BOOL=OFF - -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF - -DCMAKE_BUILD_TYPE:STRING=RELEASE - -DBUILD_SHARED_LIBS:STRING=no - -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}/protobuf - DEPENDS protobuf-repo -) -set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf/${CMAKE_INSTALL_LIBDIR}/cmake/protobuf") -set(TRITON_LEGION_PROTOBUF_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf) - -# -# Build googletest project from protobuf-repo -# -if(${TRITON_LEGION_BACKEND_BUILD_TEST}) - ExternalProject_Add(googletest - PREFIX protobuf - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf-repo/src/protobuf/third_party/googletest" - DOWNLOAD_COMMAND "" - CMAKE_CACHE_ARGS - ${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE} - ${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET} - -DCMAKE_BUILD_TYPE:STRING=RELEASE - -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}/googletest - DEPENDS protobuf-repo - ) -endif() - -# -# Build Triton Legion Backend -# -set(BACKEND_DEPENDS protobuf legion) -if(${TRITON_LEGION_BACKEND_BUILD_TEST}) - set(BACKEND_DEPENDS ${BACKEND_DEPENDS} googletest) -endif() -ExternalProject_Add(triton-legion-backend - PREFIX triton-legion-backend - SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/triton-legion-backend" - BUILD_ALWAYS 1 - CMAKE_CACHE_ARGS - -DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR} - -DLegion_DIR:PATH=${_FINDPACKAGE_LEGION_CONFIG_DIR} - -DLEGION_ROOT:PATH=${CMAKE_CURRENT_BINARY_DIR}/legion - -DGTEST_ROOT:PATH=${CMAKE_CURRENT_BINARY_DIR}/googletest - ${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE} - ${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET} - -DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU} - -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} - -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} - -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} - -DTRITON_LEGION_BACKEND_BUILD_TEST:BOOL=${TRITON_LEGION_BACKEND_BUILD_TEST} - -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}/triton-legion - DEPENDS ${BACKEND_DEPENDS} -) - -unset(CMAKE_INSTALL_PREFIX CACHE) diff --git a/triton/Dockerfile.QA b/triton/Dockerfile.QA deleted file mode 100644 index 7f1ccf3d3f..0000000000 --- a/triton/Dockerfile.QA +++ /dev/null @@ -1,95 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#------------------------------------------------------------------------------# - -# -# Multistage build. -# - -ARG BUILD_IMAGE=nvcr.io/nvidia/tritonserver:21.06-py3-min -ARG SDK_IMAGE=nvcr.io/nvidia/tritonserver:21.06-py3-sdk -ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver:21.06-py3 -ARG TRITON_COMMON_REPO_TAG=main -ARG TRITON_CORE_REPO_TAG=main -ARG TRITON_BACKEND_REPO_TAG=main - -############################################################################ -## Build legion backend from the BUILD_IMAGE since it has already been configured -## correctly and has some existing build artifacts. Copy artifacts -## into QA area. -############################################################################ -FROM ${BUILD_IMAGE} AS build - -# Ensure apt-get won't prompt for selecting options -ENV DEBIAN_FRONTEND=noninteractive - -ARG TRITON_COMMON_REPO_TAG -ARG TRITON_CORE_REPO_TAG -ARG TRITON_BACKEND_REPO_TAG - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - zlib1g-dev \ - rapidjson-dev \ - software-properties-common && \ - rm -rf /var/lib/apt/lists/* - -# Legion backend build requires recent version of CMake (FetchContent required) -RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ - gpg --dearmor - | \ - tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main' && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - cmake-data=3.18.4-0kitware1ubuntu20.04.1 cmake=3.18.4-0kitware1ubuntu20.04.1 - -WORKDIR /workspace -COPY . . -RUN cmake -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \ - -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \ - -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} . && \ - make -j16 - -# Introduce the name -FROM ${SDK_IMAGE} AS sdk - -############################################################################ -## Create CI enabled image -############################################################################ -FROM ${TRITON_IMAGE} - -WORKDIR /opt/tritonserver -COPY --chown=1000:1000 --from=build /workspace/qa/ qa/ -COPY --chown=1000:1000 --from=build /workspace/triton-legion/backends/legion backends/legion/. -COPY --chown=1000:1000 --from=build /workspace/triton-legion/test/onnx_parser_test qa/L0_parser/. -COPY --chown=1000:1000 --from=build /workspace/triton-legion/test/data/* qa/L0_parser/data/ -COPY --chown=1000:1000 --from=sdk /workspace/install/python/triton*.whl qa/pkgs/ - -# Ensure apt-get won't prompt for selecting options -ENV DEBIAN_FRONTEND=noninteractive - -# Install dependencies for running Legion backend -RUN apt-get update && apt-get install -y --no-install-recommends \ - openmpi-bin && \ - rm -rf /var/lib/apt/lists/* - -# CI/QA expects "python" executable (not python3). 
-RUN rm -f /usr/bin/python && \
-    ln -s /usr/bin/python3 /usr/bin/python
-# Install Python packages for test
-RUN pip3 install --upgrade numpy
-RUN find qa/pkgs/ -maxdepth 1 -type f -name \
-    "tritonclient-*-manylinux1_x86_64.whl" | xargs printf -- '%s[all]' | \
-    xargs pip3 install --upgrade
diff --git a/triton/README.md b/triton/README.md
deleted file mode 100644
index d4a6ae40dd..0000000000
--- a/triton/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Legion Triton Backend
-
-This directory contains an incomplete prototype for a new
-[backend for Triton](https://github.com/triton-inference-server/backend) built on top of the
-[Legion runtime](https://legion.stanford.edu) for handling multi-node multi-GPU inference
-requests. While Legion is the primary runtime carrying out multi-node inference jobs, users
-do not need to understand Legion at all to use this backend.
-
-## Build instructions
-
-### CMake
-
-A simple CMake setup is provided to build the Legion backend and resolve its dependencies.
-Note that the build installs protobuf with customized settings; please make sure
-the system does not already have protobuf installed, to avoid conflicts.
-
-```
-$ mkdir build
-$ cd build
-$ cmake ..
-$ make
-```
-
-After the build, the backend shared library can be found at `/PATH/TO/BUILDDIR/triton-legion/backends/legion`.
-
-By default, the unit tests and test data are installed at `/PATH/TO/BUILDDIR/triton-legion/test`,
-and can be run after switching the current directory to the installed location.
-
-### Make
-
-Protobuf is required for the backend, and it must be installed from source with the following
-commands to build the static protobuf library that can be linked with the backend shared library:
-
-```
-git clone https://github.com/protocolbuffers/protobuf.git
-cd protobuf
-git checkout v3.17.1
-cd cmake
-cmake -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -Dprotobuf_BUILD_TESTS:BOOL=OFF -Dprotobuf_WITH_ZLIB:BOOL=OFF -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -DCMAKE_BUILD_TYPE:STRING=RELEASE -DBUILD_SHARED_LIBS:STRING=no .
-make install
-```
-
-Set the `LG_RT_DIR` environment variable to point to the `legion/runtime` directory in a Legion repo.
-
-Set `TRITON_DIR` to point to an installation of the Triton server.
-
-Go into the `src` directory and type `make`.
-
-Copy the `libtriton_flexflow.so` shared object to a Triton model repository.
diff --git a/triton/cmake/TritonLegionBackendConfig.cmake.in b/triton/cmake/TritonLegionBackendConfig.cmake.in
deleted file mode 100644
index c19a30d1f3..0000000000
--- a/triton/cmake/TritonLegionBackendConfig.cmake.in
+++ /dev/null
@@ -1,29 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-#------------------------------------------------------------------------------#
-
-include(CMakeFindDependencyMacro)
-
-get_filename_component(
-  TRITONLEGIONBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
-)
-
-list(APPEND CMAKE_MODULE_PATH ${TRITONLEGIONBACKEND_CMAKE_DIR})
-
-if(NOT TARGET TritonLegionBackend::triton-legion-backend)
-  include("${TRITONLEGIONBACKEND_CMAKE_DIR}/TritonLegionBackendTargets.cmake")
-endif()
-
-set(TRITONLEGIONBACKEND_LIBRARIES TritonLegionBackend::triton-legion-backend)
diff --git a/triton/qa/L0_e2e/models/add/1/model.onnx b/triton/qa/L0_e2e/models/add/1/model.onnx
deleted file mode 100644
index f6c18819c6..0000000000
Binary files a/triton/qa/L0_e2e/models/add/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/add/1/model.strategy b/triton/qa/L0_e2e/models/add/1/model.strategy
deleted file mode 100644
index 5a5f0f558e..0000000000
--- a/triton/qa/L0_e2e/models/add/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Add 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/add/config.pbtxt b/triton/qa/L0_e2e/models/add/config.pbtxt
deleted file mode 100644
index 3785f2880d..0000000000
--- a/triton/qa/L0_e2e/models/add/config.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "add"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input0"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  },
-  {
-    name: "input1"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/cast/1/model.onnx b/triton/qa/L0_e2e/models/cast/1/model.onnx
deleted file mode 100644
index e340fd2b94..0000000000
Binary files a/triton/qa/L0_e2e/models/cast/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/cast/1/model.strategy b/triton/qa/L0_e2e/models/cast/1/model.strategy
deleted file mode 100644
index 8c9dca0341..0000000000
--- a/triton/qa/L0_e2e/models/cast/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Cast 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/cast/config.pbtxt b/triton/qa/L0_e2e/models/cast/config.pbtxt
deleted file mode 100644
index 3a2bb767b1..0000000000
--- a/triton/qa/L0_e2e/models/cast/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "cast"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 1, 3 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP64
-    dims: [ 1, 3 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/identity/1/model.onnx b/triton/qa/L0_e2e/models/identity/1/model.onnx
deleted file mode 100644
index da9fc01ed0..0000000000
Binary files a/triton/qa/L0_e2e/models/identity/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/identity/1/model.strategy b/triton/qa/L0_e2e/models/identity/1/model.strategy
deleted file mode 100644
index 01a3f462b0..0000000000
--- a/triton/qa/L0_e2e/models/identity/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Identity_0 0 4 1 1 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/identity/config.pbtxt b/triton/qa/L0_e2e/models/identity/config.pbtxt
deleted file mode 100644
index 9b15a29078..0000000000
--- a/triton/qa/L0_e2e/models/identity/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "identity"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 4, 1, 5, 5 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 4, 1, 5, 5 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/mul/1/model.onnx b/triton/qa/L0_e2e/models/mul/1/model.onnx
deleted file mode 100644
index 1de5b71943..0000000000
Binary files a/triton/qa/L0_e2e/models/mul/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/mul/1/model.strategy b/triton/qa/L0_e2e/models/mul/1/model.strategy
deleted file mode 100644
index c53fce6be7..0000000000
--- a/triton/qa/L0_e2e/models/mul/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Mul 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/mul/config.pbtxt b/triton/qa/L0_e2e/models/mul/config.pbtxt
deleted file mode 100644
index 4fcd1d2784..0000000000
--- a/triton/qa/L0_e2e/models/mul/config.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "mul"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input0"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  },
-  {
-    name: "input1"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/reciprocal/1/model.onnx b/triton/qa/L0_e2e/models/reciprocal/1/model.onnx
deleted file mode 100644
index 1eeb4c9103..0000000000
Binary files a/triton/qa/L0_e2e/models/reciprocal/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/reciprocal/1/model.strategy b/triton/qa/L0_e2e/models/reciprocal/1/model.strategy
deleted file mode 100644
index c5f69fab2e..0000000000
--- a/triton/qa/L0_e2e/models/reciprocal/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Reciprocal 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/reciprocal/config.pbtxt b/triton/qa/L0_e2e/models/reciprocal/config.pbtxt
deleted file mode 100644
index 411f0cd9f5..0000000000
--- a/triton/qa/L0_e2e/models/reciprocal/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "reciprocal"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 1, 3 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 1, 3 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/softmax/1/model.onnx b/triton/qa/L0_e2e/models/softmax/1/model.onnx
deleted file mode 100644
index 4a715b10b6..0000000000
Binary files a/triton/qa/L0_e2e/models/softmax/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/softmax/1/model.strategy b/triton/qa/L0_e2e/models/softmax/1/model.strategy
deleted file mode 100644
index a742fc5d34..0000000000
--- a/triton/qa/L0_e2e/models/softmax/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Softmax 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/softmax/config.pbtxt b/triton/qa/L0_e2e/models/softmax/config.pbtxt
deleted file mode 100755
index 6f187aa0c8..0000000000
--- a/triton/qa/L0_e2e/models/softmax/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "softmax"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/softmax1/1/model.onnx b/triton/qa/L0_e2e/models/softmax1/1/model.onnx
deleted file mode 100644
index 8d74038976..0000000000
Binary files a/triton/qa/L0_e2e/models/softmax1/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/softmax1/1/model.strategy b/triton/qa/L0_e2e/models/softmax1/1/model.strategy
deleted file mode 100644
index 939f196efd..0000000000
--- a/triton/qa/L0_e2e/models/softmax1/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Softmax_1 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/softmax1/config.pbtxt b/triton/qa/L0_e2e/models/softmax1/config.pbtxt
deleted file mode 100755
index 39a3f14c83..0000000000
--- a/triton/qa/L0_e2e/models/softmax1/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "softmax1"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/sqrt/1/model.onnx b/triton/qa/L0_e2e/models/sqrt/1/model.onnx
deleted file mode 100644
index f1ded959b8..0000000000
Binary files a/triton/qa/L0_e2e/models/sqrt/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/sqrt/1/model.strategy b/triton/qa/L0_e2e/models/sqrt/1/model.strategy
deleted file mode 100644
index 3c2d5fa515..0000000000
--- a/triton/qa/L0_e2e/models/sqrt/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Sqrt 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/sqrt/config.pbtxt b/triton/qa/L0_e2e/models/sqrt/config.pbtxt
deleted file mode 100644
index 237cd6b322..0000000000
--- a/triton/qa/L0_e2e/models/sqrt/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "sqrt"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/sub/1/model.onnx b/triton/qa/L0_e2e/models/sub/1/model.onnx
deleted file mode 100644
index 43c7f2ad75..0000000000
Binary files a/triton/qa/L0_e2e/models/sub/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/sub/1/model.strategy b/triton/qa/L0_e2e/models/sub/1/model.strategy
deleted file mode 100644
index c9d492edd9..0000000000
--- a/triton/qa/L0_e2e/models/sub/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Sub 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/sub/config.pbtxt b/triton/qa/L0_e2e/models/sub/config.pbtxt
deleted file mode 100644
index db5cbe1106..0000000000
--- a/triton/qa/L0_e2e/models/sub/config.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "sub"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input0"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  },
-  {
-    name: "input1"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 4, 2 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/models/tanh/1/model.onnx b/triton/qa/L0_e2e/models/tanh/1/model.onnx
deleted file mode 100644
index 58a49d4422..0000000000
Binary files a/triton/qa/L0_e2e/models/tanh/1/model.onnx and /dev/null differ
diff --git a/triton/qa/L0_e2e/models/tanh/1/model.strategy b/triton/qa/L0_e2e/models/tanh/1/model.strategy
deleted file mode 100644
index 5da2702f4e..0000000000
--- a/triton/qa/L0_e2e/models/tanh/1/model.strategy
+++ /dev/null
@@ -1 +0,0 @@
-1 Tanh 0 2 1 1 1 0
\ No newline at end of file
diff --git a/triton/qa/L0_e2e/models/tanh/config.pbtxt b/triton/qa/L0_e2e/models/tanh/config.pbtxt
deleted file mode 100644
index aa75a1b464..0000000000
--- a/triton/qa/L0_e2e/models/tanh/config.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-name: "tanh"
-backend: "legion"
-max_batch_size: 0
-input [
-  {
-    name: "input"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-output [
-  {
-    name: "output"
-    data_type: TYPE_FP32
-    dims: [ 3, 1 ]
-  }
-]
-instance_group [ { kind : KIND_MODEL }]
diff --git a/triton/qa/L0_e2e/operator_test.py b/triton/qa/L0_e2e/operator_test.py
deleted file mode 100644
index fe8a8cc912..0000000000
--- a/triton/qa/L0_e2e/operator_test.py
+++ /dev/null
@@ -1,401 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------# - -import unittest -import numpy as np -import tritonhttpclient -from tritonclientutils import InferenceServerException -from functools import reduce -import test_helpers as helper - -import os -import sys -import argparse -import logging - - -class OperatorTest(unittest.TestCase): - - def setUp(self): - self.client = tritonhttpclient.InferenceServerClient( - url="localhost:8000") - - def test_identity(self): - log = logging.getLogger("Operator Test Logging") - log.debug("====== test_identity ======") - model_name = "identity" - - # Prepare input - input_shape = [4, 1, 5, 5] - ec = reduce((lambda x, y: x * y), input_shape) - input_data = np.arange(ec, dtype=np.float32).reshape(input_shape) - inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")] - inputs[0].set_data_from_numpy(input_data) - - output_name = 'output' - outputs = [tritonhttpclient.InferRequestedOutput(output_name)] - - log.debug("input data: {}".format(input_data)) - - try: - result = self.client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) - - # Validate the results by comparing with precomputed values. - output_data = result.as_numpy(output_name) - self.assertTrue( - np.array_equal(output_data, input_data), - "Expect response to have value {}, got {}".format( - input_data, output_data)) - log.debug("output data: {}".format(output_data)) - except InferenceServerException as ex: - self.assertTrue(False, "unexpected error {}".format(ex)) - log.debug("====== end of test_identity ======") - - def test_add(self): - log = logging.getLogger("Operator Test Logging") - log.debug("====== test_add ======") - model_name = "add" - - # Prepare input - input_shape = [4, 2] - ec = reduce((lambda x, y: x * y), input_shape) - input_data = np.arange(ec, dtype=np.float32).reshape(input_shape) - inputs = [ - tritonhttpclient.InferInput('input0', input_shape, "FP32"), - tritonhttpclient.InferInput('input1', input_shape, "FP32") - ] - inputs[0].set_data_from_numpy(input_data) - inputs[1].set_data_from_numpy(input_data) - - output_name = 'output' - outputs = [tritonhttpclient.InferRequestedOutput(output_name)] - expected_output_data = input_data + input_data - - log.debug("input data: {}".format(input_data)) - - try: - result = self.client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) - - # Validate the results by comparing with precomputed values. 
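-            # np.array_equal below checks for exact equality; that is safe for
-            # these element-wise ops on small float32 integers, which are
-            # exactly representable. The softmax tests further down use
-            # np.allclose instead.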
- output_data = result.as_numpy(output_name) - self.assertTrue( - np.array_equal(output_data, expected_output_data), - "Expect response to have value {}, got {}".format( - expected_output_data, output_data)) - log.debug("output data: {}".format(output_data)) - except InferenceServerException as ex: - self.assertTrue(False, "unexpected error {}".format(ex)) - log.debug("====== end of test_add ======") - - def test_sub(self): - log = logging.getLogger("Operator Test Logging") - log.debug("====== test_sub ======") - model_name = "sub" - - # Prepare input - input_shape = [4, 2] - ec = reduce((lambda x, y: x * y), input_shape) - input0_data = np.ones(input_shape, - dtype=np.float32).reshape(input_shape) - input1_data = np.arange(ec, dtype=np.float32).reshape(input_shape) - inputs = [ - tritonhttpclient.InferInput('input0', input_shape, "FP32"), - tritonhttpclient.InferInput('input1', input_shape, "FP32") - ] - inputs[0].set_data_from_numpy(input0_data) - inputs[1].set_data_from_numpy(input1_data) - - output_name = 'output' - outputs = [tritonhttpclient.InferRequestedOutput(output_name)] - expected_output_data = input0_data - input1_data - - log.debug("input0 data: {}".format(input0_data)) - log.debug("input1 data: {}".format(input1_data)) - - try: - result = self.client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) - - # Validate the results by comparing with precomputed values. - output_data = result.as_numpy(output_name) - self.assertTrue( - np.array_equal(output_data, expected_output_data), - "Expect response to have value {}, got {}".format( - expected_output_data, output_data)) - log.debug("output data: {}".format(output_data)) - except InferenceServerException as ex: - self.assertTrue(False, "unexpected error {}".format(ex)) - log.debug("====== end of test_sub ======") - - def test_mul(self): - log = logging.getLogger("Operator Test Logging") - log.debug("====== test_mul ======") - model_name = "mul" - - # Prepare input - input_shape = [4, 2] - ec = reduce((lambda x, y: x * y), input_shape) - input_data = np.arange(ec, dtype=np.float32).reshape(input_shape) - inputs = [ - tritonhttpclient.InferInput('input0', input_shape, "FP32"), - tritonhttpclient.InferInput('input1', input_shape, "FP32") - ] - inputs[0].set_data_from_numpy(input_data) - inputs[1].set_data_from_numpy(input_data) - - output_name = 'output' - outputs = [tritonhttpclient.InferRequestedOutput(output_name)] - expected_output_data = input_data * input_data - - log.debug("input data: {}".format(input_data)) - - try: - result = self.client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) - - # Validate the results by comparing with precomputed values. 
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.array_equal(output_data, expected_output_data),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_mul ======")
-
-    def test_tanh(self):
-        log = logging.getLogger("Operator Test Logging")
-        log.debug("====== test_tanh ======")
-        model_name = "tanh"
-
-        # Prepare input
-        input_shape = [3, 1]
-        ec = reduce((lambda x, y: x * y), input_shape)
-        input_data = np.arange(ec, dtype=np.float32).reshape(input_shape)
-        inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")]
-        inputs[0].set_data_from_numpy(input_data)
-
-        output_name = 'output'
-        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
-
-        log.debug("input data: {}".format(input_data))
-        expected_output_data = np.tanh(input_data)
-
-        try:
-            result = self.client.infer(model_name=model_name,
-                                       inputs=inputs,
-                                       outputs=outputs)
-
-            # Validate the results by comparing with precomputed values.
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.array_equal(output_data, expected_output_data),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_tanh ======")
-
-    def test_reciprocal(self):
-        log = logging.getLogger("Operator Test Logging")
-        log.debug("====== test_reciprocal ======")
-        model_name = "reciprocal"
-
-        # Prepare input
-        input_shape = [1, 3]
-        ec = reduce((lambda x, y: x * y), input_shape)
-        input_data = np.linspace(0, .1, ec,
-                                 dtype=np.float32).reshape(input_shape)
-        inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")]
-        inputs[0].set_data_from_numpy(input_data)
-
-        output_name = 'output'
-        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
-
-        log.debug("input data: {}".format(input_data))
-        expected_output_data = np.reciprocal(input_data)
-
-        try:
-            result = self.client.infer(model_name=model_name,
-                                       inputs=inputs,
-                                       outputs=outputs)
-
-            # Validate the results by comparing with precomputed values.
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.array_equal(output_data, expected_output_data),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_reciprocal ======")
-
-    def test_sqrt(self):
-        log = logging.getLogger("Operator Test Logging")
-        log.debug("====== test_sqrt ======")
-        model_name = "sqrt"
-
-        # Prepare input
-        input_shape = [3, 1]
-        ec = reduce((lambda x, y: x * y), input_shape)
-        input_data = np.arange(ec, dtype=np.float32).reshape(input_shape)
-        inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")]
-        inputs[0].set_data_from_numpy(input_data)
-
-        output_name = 'output'
-        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
-
-        log.debug("input data: {}".format(input_data))
-        expected_output_data = np.sqrt(input_data)
-
-        try:
-            result = self.client.infer(model_name=model_name,
-                                       inputs=inputs,
-                                       outputs=outputs)
-
-            # Validate the results by comparing with precomputed values.
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.array_equal(output_data, expected_output_data),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_sqrt ======")
-
-    def test_cast(self):
-        log = logging.getLogger("Operator Test Logging")
-        log.debug("====== test_cast ======")
-        model_name = "cast"
-
-        # Prepare input
-        input_shape = [1, 3]
-        ec = reduce((lambda x, y: x * y), input_shape)
-        input_data = np.linspace(0, .1, ec,
-                                 dtype=np.float32).reshape(input_shape)
-        inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")]
-        inputs[0].set_data_from_numpy(input_data)
-
-        output_name = 'output'
-        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
-
-        log.debug("input data: {}".format(input_data))
-        expected_output_data = input_data.astype(np.double, copy=True)
-
-        try:
-            result = self.client.infer(model_name=model_name,
-                                       inputs=inputs,
-                                       outputs=outputs)
-
-            # Validate the results by comparing with precomputed values.
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.array_equal(output_data, expected_output_data),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_cast ======")
-
-    def test_softmax(self):
-        log = logging.getLogger("Operator Test Logging")
-        log.debug("====== test_softmax ======")
-        model_name = "softmax"
-
-        # Prepare input
-        input_shape = [3, 1]
-        ec = reduce((lambda x, y: x * y), input_shape)
-        input_data = np.arange(ec, dtype=np.float32).reshape(input_shape)
-        inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")]
-        inputs[0].set_data_from_numpy(input_data)
-
-        output_name = 'output'
-        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
-
-        log.debug("input data: {}".format(input_data))
-        expected_output_data = helper.softmax(input_data, 0)
-
-        try:
-            result = self.client.infer(model_name=model_name,
-                                       inputs=inputs,
-                                       outputs=outputs)
-
-            # Validate the results by comparing with precomputed values.
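-            # Softmax goes through exp() and a sum reduction, so bitwise
-            # equality with the NumPy reference is not expected; the check
-            # below therefore uses a small absolute tolerance.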
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.allclose(output_data, expected_output_data, atol=1e-07),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_softmax ======")
-
-    def test_softmax_default_axis(self):
-        log = logging.getLogger("Operator Test Logging")
-        log.debug("====== test_softmax_default_axis ======")
-        model_name = "softmax1"
-
-        # Prepare input
-        input_shape = [3, 1]
-        ec = reduce((lambda x, y: x * y), input_shape)
-        input_data = np.arange(ec, dtype=np.float32).reshape(input_shape)
-        inputs = [tritonhttpclient.InferInput('input', input_shape, "FP32")]
-        inputs[0].set_data_from_numpy(input_data)
-
-        output_name = 'output'
-        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
-
-        log.debug("input data: {}".format(input_data))
-        expected_output_data = helper.softmax(input_data, 1)
-
-        try:
-            result = self.client.infer(model_name=model_name,
-                                       inputs=inputs,
-                                       outputs=outputs)
-
-            # Validate the results by comparing with precomputed values.
-            output_data = result.as_numpy(output_name)
-            self.assertTrue(
-                np.allclose(output_data, expected_output_data, atol=1e-07),
-                "Expect response to have value {}, got {}".format(
-                    expected_output_data, output_data))
-            log.debug("output data: {}".format(output_data))
-        except InferenceServerException as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        log.debug("====== end of test_softmax_default_axis ======")
-
-
-if __name__ == '__main__':
-    if 'LEGION_BACKEND_TEST_LOG_LEVEL' in os.environ:
-        level_config = {'debug': logging.DEBUG, 'info': logging.INFO}
-        logging.basicConfig(stream=sys.stderr)
-        log_level = level_config[
-            os.environ['LEGION_BACKEND_TEST_LOG_LEVEL'].lower()]
-        logging.getLogger("Operator Test Logging").setLevel(log_level)
-
-    unittest.main()
diff --git a/triton/qa/L0_e2e/test.sh b/triton/qa/L0_e2e/test.sh
deleted file mode 100755
index 27a4e172d9..0000000000
--- a/triton/qa/L0_e2e/test.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-TEST_PY=operator_test.py
-DATADIR="./models"
-SERVER=/opt/tritonserver/bin/tritonserver
-SERVER_ARGS="--model-repository=$DATADIR"
-source ../common/util.sh
-
-rm -f *.log*
-
-RET=0
-
-# 1 GPU 1 node
-export REALM_DEFAULT_ARGS="-ll:gpu 1"
-TEST_LOG="./single_device_single_node.log"
-
-run_server
-if [ "$SERVER_PID" == "0" ]; then
-    echo -e "\n***\n*** Failed to start $SERVER\n***"
-    cat $SERVER_LOG
-    exit 1
-fi
-
-set +e
-python $TEST_PY >>$TEST_LOG 2>&1
-if [ $? 
-ne 0 ]; then - echo -e "\n***\n*** Test Failed\n***" - cat $TEST_LOG - RET=1 -fi -set -e - -# [issue #7] WAR to ignore core dump on server exit -set +e -kill_server -set -e - -# [gluo FIXME] add test for multi-GPU / multi-node - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test Failed\n***" -fi - -exit $RET diff --git a/triton/qa/L0_e2e/test_helpers.py b/triton/qa/L0_e2e/test_helpers.py deleted file mode 100644 index 4ead79f27b..0000000000 --- a/triton/qa/L0_e2e/test_helpers.py +++ /dev/null @@ -1,21 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#------------------------------------------------------------------------------# - -import numpy as np - -def softmax(input, axis): - output = np.exp(input - np.max(input, axis, keepdims=True)) - return output / np.sum(output, axis, keepdims=True) diff --git a/triton/qa/L0_parser/test.sh b/triton/qa/L0_parser/test.sh deleted file mode 100755 index 1d06933836..0000000000 --- a/triton/qa/L0_parser/test.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#------------------------------------------------------------------------------# - -TEST_BIN=./onnx_parser_test - -rm -f *.log* - -RET=0 - -TEST_LOG="./parser.log" - -set +e -$TEST_BIN >>$TEST_LOG 2>&1 -if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test Failed\n***" - cat $TEST_LOG - RET=1 -fi -set -e - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test Failed\n***" -fi - -exit $RET diff --git a/triton/qa/common/util.sh b/triton/qa/common/util.sh deleted file mode 100755 index 431e2ba97d..0000000000 --- a/triton/qa/common/util.sh +++ /dev/null @@ -1,89 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-SERVER_LOG=${SERVER_LOG:=./server.log}
-SERVER_TIMEOUT=${SERVER_TIMEOUT:=60}
-SERVER_LD_PRELOAD=${SERVER_LD_PRELOAD:=""}
-
-# Run inference server. Return once server's health endpoint shows
-# ready or timeout expires. Sets SERVER_PID to pid of SERVER, or 0 if
-# error (including expired timeout)
-function run_server () {
-    SERVER_PID=0
-
-    if [ -z "$SERVER" ]; then
-        echo "=== SERVER must be defined"
-        return
-    fi
-
-    if [ ! -f "$SERVER" ]; then
-        echo "=== $SERVER does not exist"
-        return
-    fi
-
-    if [ -z "$SERVER_LD_PRELOAD" ]; then
-        echo "=== Running $SERVER $SERVER_ARGS"
-    else
-        echo "=== Running LD_PRELOAD=$SERVER_LD_PRELOAD $SERVER $SERVER_ARGS"
-    fi
-
-    LD_PRELOAD=$SERVER_LD_PRELOAD $SERVER $SERVER_ARGS > $SERVER_LOG 2>&1 &
-    SERVER_PID=$!
-
-    wait_for_server_ready $SERVER_PID $SERVER_TIMEOUT
-    if [ "$WAIT_RET" != "0" ]; then
-        kill $SERVER_PID || true
-        SERVER_PID=0
-    fi
-}
-
-# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
-# success, 1 on failure
-function wait_for_server_ready() {
-    local spid="$1"; shift
-    local wait_time_secs="${1:-30}"; shift
-
-    WAIT_RET=0
-
-    local wait_secs=$wait_time_secs
-    until test $wait_secs -eq 0 ; do
-        if ! kill -0 $spid; then
-            echo "=== Server not running."
-            WAIT_RET=1
-            return
-        fi
-
-        sleep 1;
-
-        set +e
-        code=`curl -s -w %{http_code} localhost:8000/v2/health/ready`
-        set -e
-        if [ "$code" == "200" ]; then
-            return
-        fi
-
-        ((wait_secs--));
-    done
-
-    echo "=== Timeout $wait_time_secs secs. Server not ready."
-    WAIT_RET=1
-}
-
-# Kill inference server. SERVER_PID must be set to the server's pid.
-function kill_server () {
-    kill $SERVER_PID
-    wait $SERVER_PID
-}
diff --git a/triton/src/CMakeLists.txt b/triton/src/CMakeLists.txt
deleted file mode 100644
index 8f36dc7984..0000000000
--- a/triton/src/CMakeLists.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-cmake_minimum_required (VERSION 3.18)
-project (legion-backend)
-
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Release)
-endif()
-
-execute_process(COMMAND uname -p OUTPUT_VARIABLE ARCH)
-# FIXME what options should be set?
-# set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-parameter -Werror -Wno-deprecated-declarations")
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-#
-# Dependencies
-#
-# FetchContent's composability isn't very good. We must include the
-# transitive closure of all repos so that we can override the tag.
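-# For illustration only (these tag values are hypothetical, not taken from
-# this build), pinning all three repos to one release line would look like:
-#   cmake .. -DTRITON_COMMON_REPO_TAG=r22.05 \
-#            -DTRITON_CORE_REPO_TAG=r22.05 \
-#            -DTRITON_BACKEND_REPO_TAG=r22.05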
-#
-include(FetchContent)
-
-FetchContent_Declare(
-  repo-common
-  GIT_REPOSITORY https://github.com/triton-inference-server/common.git
-  GIT_TAG ${TRITON_COMMON_REPO_TAG}
-)
-FetchContent_Declare(
-  repo-core
-  GIT_REPOSITORY https://github.com/triton-inference-server/core.git
-  GIT_TAG ${TRITON_CORE_REPO_TAG}
-)
-FetchContent_Declare(
-  repo-backend
-  GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
-  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
-)
-FetchContent_MakeAvailable(repo-common repo-core repo-backend)
-
-#
-# CUDA
-#
-if(${TRITON_ENABLE_GPU})
-  find_package(CUDA REQUIRED)
-  find_package(CUDAToolkit REQUIRED)
-  enable_language(CUDA)
-endif() # TRITON_ENABLE_GPU
-
-#
-# Protobuf
-#
-set(protobuf_MODULE_COMPATIBLE TRUE CACHE BOOL "protobuf_MODULE_COMPATIBLE" FORCE)
-find_package(Protobuf CONFIG REQUIRED)
-message(STATUS "Using protobuf ${Protobuf_VERSION}")
-include_directories(${Protobuf_INCLUDE_DIRS})
-
-#
-# Legion backend
-#
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libtriton_legion.ldscript libtriton_legion.ldscript COPYONLY)
-find_package(Legion REQUIRED)
-
-# Use customized protoc command to generate cpp files with proper layout
-set(PROTO_SRCS onnx/onnx-data.pb.cc onnx/onnx-ml.pb.cc onnx/onnx-operators-ml.pb.cc)
-add_custom_command(
-  OUTPUT ${PROTO_SRCS}
-  ALL
-  COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-    -I${CMAKE_CURRENT_SOURCE_DIR} --cpp_out=${CMAKE_CURRENT_BINARY_DIR}
-    onnx/*.proto
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  COMMENT "Compiling cpp files of the ONNX protos"
-)
-
-file(GLOB OPERATOR_SRCS operators/*.cc)
-if(${TRITON_ENABLE_GPU})
-  file(GLOB OPERATOR_CUDA_SRCS operators/*.cu)
-  set(OPERATOR_SRCS ${OPERATOR_SRCS} ${OPERATOR_CUDA_SRCS})
-endif() # TRITON_ENABLE_GPU
-
-
-add_library(
-  triton-legion-backend SHARED
-  backend.cc
-  model.cc
-  instance.cc
-  onnx_parser.cc
-  ${PROTO_SRCS}
-  runtime.cc
-  operator.cc
-  strategy.cc
-  tensor.cc
-  ${OPERATOR_SRCS}
-)
-
-add_library(
-  TritonLegionBackend::triton-legion-backend ALIAS triton-legion-backend
-)
-
-target_include_directories(
-  triton-legion-backend
-  PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${CMAKE_CURRENT_BINARY_DIR}
-)
-
-target_compile_features(triton-legion-backend PRIVATE cxx_std_11)
-# Note that __CUDA_NO_HALF_OPERATORS__ is for Legion's fp16 implementation;
-# it is defined in Legion's code but was being ignored in this CMake build.
-target_compile_options(
-  triton-legion-backend PRIVATE
-  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits>
-  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc>
-  $<$<COMPILE_LANGUAGE:CUDA>:-D__CUDA_NO_HALF_OPERATORS__>
-)
-
-if(${TRITON_ENABLE_GPU})
-  target_compile_definitions(
-    triton-legion-backend
-    PRIVATE TRITON_ENABLE_GPU=1
-    PRIVATE LEGION_USE_CUDA=1
-  )
-  # Some cuda_fp16 functions are only defined for __CUDA_ARCH__ >= 530;
-  # the default is 520, which is too old.
-  # FIXME expose target arch as CMake option
-  set_target_properties(triton-legion-backend PROPERTIES CUDA_ARCHITECTURES "70")
-endif() # TRITON_ENABLE_GPU
-
-set_target_properties(
-  triton-legion-backend
-  PROPERTIES
-    POSITION_INDEPENDENT_CODE ON
-    OUTPUT_NAME triton_legion
-    SKIP_BUILD_RPATH TRUE
-    BUILD_WITH_INSTALL_RPATH TRUE
-    INSTALL_RPATH_USE_LINK_PATH FALSE
-    INSTALL_RPATH "$\{ORIGIN\}"
-    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_legion.ldscript
-    LINK_FLAGS "-Wl,--version-script libtriton_legion.ldscript"
-)
-
-# Must enforce a specific linking order so that the backend calls into Legion's
-# hijacked cudart APIs first, and falls back to the regular APIs for anything
-# not hijacked.
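-# CMake preserves the order of PRIVATE link items, so Legion::Legion is listed
-# before CUDA::cudart below to let the hijacked symbols resolve first.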
-target_link_libraries(
-  triton-legion-backend
-  PRIVATE
-    Legion::Legion
-)
-if(${TRITON_ENABLE_GPU})
-  target_link_libraries(
-    triton-legion-backend
-    PRIVATE
-      CUDA::cudart
-      -lcublas
-      -lcudnn
-  )
-endif() # TRITON_ENABLE_GPU
-target_link_libraries(
-  triton-legion-backend
-  PRIVATE
-    triton-core-serverapi   # from repo-core
-    triton-core-backendapi  # from repo-core
-    triton-core-serverstub  # from repo-core
-    triton-backend-utils    # from repo-backend
-    protobuf::libprotobuf
-)
-
-
-# add_dependencies(triton-legion-backend legion_proto)
-
-set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonLegionBackend)
-
-install(
-  TARGETS
-    triton-legion-backend
-  EXPORT
-    triton-legion-backend-targets
-  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/legion
-  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/legion
-)
-
-if(${TRITON_LEGION_BACKEND_BUILD_TEST})
-  add_subdirectory(test test)
-endif()
diff --git a/triton/src/Makefile b/triton/src/Makefile
deleted file mode 100644
index 2e67779f4f..0000000000
--- a/triton/src/Makefile
+++ /dev/null
@@ -1,124 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-ifndef LG_RT_DIR
-$(error LG_RT_DIR variable is not defined, aborting build)
-endif
-ifndef TRITON_DIR
-$(error TRITON_DIR variable is not defined, aborting build)
-endif
-ifndef PROTOBUF_DIR
-$(error PROTOBUF_DIR variable is not defined, aborting build)
-endif
-
-# Flags for directing the runtime makefile what to include
-DEBUG ?= 0 # Include debugging symbols
-MAX_DIM ?= 4 # Maximum number of dimensions
-MAX_FIELDS ?= 256 # Maximum number of fields in a field space
-OUTPUT_LEVEL ?= LEVEL_INFO # Compile time logging level
-USE_FORTRAN ?= 0 # Include Fortran support
-USE_CUDA ?= 1 # Include CUDA support (requires CUDA)
-USE_OPENMP ?= 0 # Include OpenMP processor support
-USE_NETWORK ?= 1 # Include support for multi-node execution
-USE_ZLIB ?= 1 # Use ZLib for compression of log files
-USE_LIBDL ?= 1 # Use LibDL for finding function pointer names
-USE_LLVM ?= 0 # Include support for LLVM task variants
-USE_HDF ?= 0 # Include HDF5 support (requires HDF5)
-USE_SPY ?= 0 # Enable support for detailed Legion Spy logging
-USE_HALF ?= 0 # Include support for half-precision reductions
-USE_COMPLEX ?= 0 # Include support for complex type reductions
-SHARED_OBJECTS ?= 0 # Generate shared objects for Legion and Realm
-BOUNDS_CHECKS ?= 0 # Enable runtime bounds checks
-PRIVILEGE_CHECKS ?= 0 # Enable runtime privilege checks
-MARCH ?= native # Set the name of the target CPU architecture
-GPU_ARCH ?= ampere # Set the name of the target GPU architecture
-CONDUIT ?= ibv # Set the name of the GASNet conduit to use
-REALM_NETWORKS ?= gasnetex # Set the kind of networking layer to use
-GASNET ?= # Location of GASNet installation
-CUDA ?= # Location of CUDA installation
-HDF_ROOT ?= # Location of HDF5 installation
-PREFIX ?= /usr # Location of where to install Legion
-
-# Put the binary file name here
-OUTFILE ?= libtriton_legion.so
-# List all the application source files here
-CC_SRC ?= # .c files
-CXX_SRC ?= backend.cc \
-           model.cc \
-           runtime.cc \
-           instance.cc \
-           onnx_parser.cc \
-           onnx/onnx-data.pb.cc \
-           onnx/onnx-ml.pb.cc \
-           onnx/onnx-operators-ml.pb.cc \
-           operator.cc \
-           strategy.cc \
-           tensor.cc \
-           operators/unary.cc \
-           operators/pool2d.cc \
-           operators/concat.cc \
-           operators/conv2d.cc \
-           operators/matmul.cc \
-           operators/softmax.cc \
-           operators/reshape.cc # .cc files
-CUDA_SRC ?= operators/unary.cu
-#CUDA_SRC ?= flexflow/runtime/model.cu \
-            flexflow/ops/attention.cu \
-            flexflow/ops/batch_matmul.cu \
-            flexflow/ops/batch_norm.cu \
-            flexflow/ops/concat.cu \
-            flexflow/ops/conv_2d.cu \
-            flexflow/ops/dropout.cu \
-            flexflow/ops/element_binary.cu \
-            flexflow/ops/element_unary.cu \
-            flexflow/ops/embedding.cu \
-            flexflow/ops/flat.cu \
-            flexflow/ops/fused.cu \
-            flexflow/ops/linear.cu \
-            flexflow/ops/pool_2d.cu \
-            flexflow/ops/reshape.cu \
-            flexflow/ops/reverse.cu \
-            flexflow/ops/softmax.cu \
-            flexflow/ops/split.cu \
-            flexflow/ops/tanh.cu \
-            flexflow/ops/topk.cu \
-            flexflow/ops/transpose.cu # .cu files
-FORT_SRC ?= # .f90 files
-ASM_SRC ?= # .S files
-
-# You can modify these variables, some will be appended to by the runtime makefile
-INC_FLAGS ?= -I$(TRITON_DIR)/include -I$(PROTOBUF_DIR)/include # Include flags for all compilers
-CC_FLAGS ?= -fPIC # Flags for all C++ compilers
-FC_FLAGS ?= # Flags for all Fortran compilers
-NVCC_FLAGS ?= -Xcompiler -fPIC # Flags for all NVCC files
-SO_FLAGS ?= # Flags for building shared objects
-LD_FLAGS ?= -shared -Wl,--version-script libtriton_legion.ldscript -L$(PROTOBUF_DIR)/lib -lprotobuf -L$(TRITON_DIR)/lib -ltritonasyncworkqueue -ltritonbackendutils -ltritonserver_stub # Flags for linking binaries
-# Canonical GNU flags you can modify as well
-CPPFLAGS ?=
-CFLAGS ?=
-CXXFLAGS ?=
-FFLAGS ?=
-LDLIBS ?=
-LDFLAGS ?=
-
-###########################################################################
-#
-# Don't change anything below here
-#
-###########################################################################
-
-include $(LG_RT_DIR)/runtime.mk
-
diff --git a/triton/src/accessor.h b/triton/src/accessor.h
deleted file mode 100644
index 365e032675..0000000000
--- a/triton/src/accessor.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LEGION_TRITON_ACCESSOR_H__
-#define __LEGION_TRITON_ACCESSOR_H__
-
-#include "legion.h"
-#include "types.h"
-
-namespace triton { namespace backend { namespace legion {
-
-template <legion_privilege_mode_t MODE, int DIM>
-class TensorAccessor {
- public:
-  static inline void* access(
-      DataType type, const Legion::Rect<DIM>& bounds,
-      const Legion::PhysicalRegion& region)
-  {
-    // Legion doesn't understand types, it just knows about field
-    // sizes so we just need to use types of the right size
-    switch (sizeof_datatype(type)) {
-      case 1: {
-        Legion::FieldAccessor<
-            MODE, int8_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int8_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      case 2: {
-        Legion::FieldAccessor<
-            MODE, int16_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int16_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      case 4: {
-        Legion::FieldAccessor<
-            MODE, int32_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int32_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      case 8: {
-        Legion::FieldAccessor<
-            MODE, int64_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int64_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      default:
-        assert(false);
-    }
-    return nullptr;
-  };
-};
-
-// specialization for read-only privileges to return a const void*
-template <int DIM>
-class TensorAccessor<LEGION_READ_ONLY, DIM> {
- public:
-  static inline const void* access(
-      DataType type, const Legion::Rect<DIM>& bounds,
-      const Legion::PhysicalRegion& region)
-  {
-    // Legion doesn't understand types, it just knows about field
-    // sizes so we just need to use types of the right size
-    switch (sizeof_datatype(type)) {
-      case 1: {
-        Legion::FieldAccessor<
-            LEGION_READ_ONLY, int8_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int8_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      case 2: {
-        Legion::FieldAccessor<
-            LEGION_READ_ONLY, int16_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int16_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      case 4: {
-        Legion::FieldAccessor<
-            LEGION_READ_ONLY, int32_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int32_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      case 8: {
-        Legion::FieldAccessor<
-            LEGION_READ_ONLY, int64_t, DIM, Legion::coord_t,
-            Realm::AffineAccessor<int64_t, DIM, Legion::coord_t>>
-            accessor(region, FID_DATA);
-        return accessor.ptr(bounds);
-      }
-      default:
-        assert(false);
-    }
-    return nullptr;
-  };
-};
-
-}}}  // namespace triton::backend::legion
-
-#endif  // __LEGION_TRITON_ACCESSOR_H__
diff --git a/triton/src/backend.cc b/triton/src/backend.cc
deleted file mode 100644
index b4204db4f2..0000000000
--- a/triton/src/backend.cc
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "instance.h"
-#include "legion.h"
-#include "model.h"
-#include "operator.h"
-#include "runtime.h"
-
-namespace triton { namespace backend { namespace legion {
-
-//
-// Legion backend implementation
-//
-
-/////////////
-
-extern "C" {
-
-// Implementing TRITONBACKEND_Initialize is optional. The backend
-// should initialize any global state that is intended to be shared
-// across all models and model instances that use the backend.
-TRITONSERVER_Error*
-TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
-{
-  // TODO: Currently assume this is called collectively
-  const char* cname;
-  RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
-  std::string name(cname);
-
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      (std::string("TRITONBACKEND_Initialize: ") + name).c_str());
-
-  // We should check the backend API version that Triton supports
-  // vs. what this backend was compiled against.
-  uint32_t api_version_major, api_version_minor;
-  RETURN_IF_ERROR(
-      TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
-
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      (std::string("Triton TRITONBACKEND API version: ") +
-       std::to_string(api_version_major) + "." +
-       std::to_string(api_version_minor))
-          .c_str());
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      (std::string("'") + name + "' TRITONBACKEND API version: " +
-       std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." +
-       std::to_string(TRITONBACKEND_API_VERSION_MINOR))
-          .c_str());
-
-  if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
-      (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_UNSUPPORTED,
-        "triton backend API version does not support this backend");
-  }
-
-  // Perform preregistration of Legion resources before starting execution
-  const Legion::TaskID top_task_id = Legion::Runtime::generate_static_task_id();
-
-  // Get the command line arguments to pass to Legion
-  TRITONSERVER_Message* backend_config_message;
-  RETURN_IF_ERROR(
-      TRITONBACKEND_BackendConfig(backend, &backend_config_message));
-
-  const char* buffer;
-  size_t byte_size;
-  RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(
-      backend_config_message, &buffer, &byte_size));
-
-  common::TritonJson::Value cmdline_args;
-  TRITONSERVER_Error* err = cmdline_args.Parse(buffer, byte_size);
-  RETURN_IF_ERROR(err);
-
-  std::vector<char*>* args = new std::vector<char*>();
-  // We'll fake it for now and leak the pointers
-  args->push_back(strdup("tritonserver"));
-  if (cmdline_args.Find("cmdline")) {
-    // TODO: parse the command line arguments to get configuration args
-  }
-
-  // Preregister the operator tasks before we start the runtime
-  Operator::PreregisterTaskVariants();
-
-  // Start the Legion runtime
-  if (Legion::Runtime::start(
-          args->size(), &(args->front()), true /*background*/))
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_UNKNOWN, "failed to start Legion runtime");
-
-  LegionTritonRuntime* runtime;
-  RETURN_IF_ERROR(LegionTritonRuntime::Create(top_task_id, &runtime));
-  RETURN_IF_ERROR(
-      TRITONBACKEND_BackendSetState(backend, reinterpret_cast<void*>(runtime)));
-
-  return nullptr;  // success
-}
-
-// Implementing TRITONBACKEND_Finalize is optional unless state is set
-// using TRITONBACKEND_BackendSetState. The backend must free this
-// state and perform any other global cleanup.
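-// Ordering note: wait_for_shutdown() below must drain all outstanding Legion
-// work before the LegionTritonRuntime state can safely be deleted.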
-TRITONSERVER_Error*
-TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
-{
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      "TRITONBACKEND_Finalize: deleting Legion backend state");
-
-  // Wait for the Legion runtime to shutdown
-  if (Legion::Runtime::wait_for_shutdown())
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_UNKNOWN, "failed to shutdown Legion runtime");
-
-  void* runtimestate;
-  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &runtimestate));
-  LegionTritonRuntime* runtime =
-      reinterpret_cast<LegionTritonRuntime*>(runtimestate);
-  // Only once we are shutdown can we delete this
-  delete runtime;
-
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_VERBOSE,
-      "TRITONBACKEND_Finalize: deleted Legion backend state");
-
-  return nullptr;  // success
-}
-
-// Implementing TRITONBACKEND_ModelInitialize is optional. The backend
-// should initialize any state that is intended to be shared across
-// all instances of the model.
-TRITONSERVER_Error*
-TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
-{
-  // TODO: Currently assume this is called collectively
-  const char* cname;
-  RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));
-  std::string name(cname);
-
-  uint64_t version;
-  RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));
-
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " +
-       std::to_string(version) + ")")
-          .c_str());
-
-  // Can get location of the model artifacts. Normally we would need
-  // to check the artifact type to make sure it was something we can
-  // handle... but we are just going to log the location so we don't
-  // need the check. We would use the location if we wanted to load
-  // something from the model's repo.
-  TRITONBACKEND_ArtifactType artifact_type;
-  const char* clocation;
-  RETURN_IF_ERROR(
-      TRITONBACKEND_ModelRepository(model, &artifact_type, &clocation));
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      (std::string("Repository location: ") + clocation).c_str());
-
-  // The model can access the backend as well... here we can access
-  // the backend global state.
-  TRITONBACKEND_Backend* backend;
-  RETURN_IF_ERROR(TRITONBACKEND_ModelBackend(model, &backend));
-
-  void* runtimestate;
-  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &runtimestate));
-  LegionTritonRuntime* runtime =
-      reinterpret_cast<LegionTritonRuntime*>(runtimestate);
-
-  // With each model we create a LegionModelState object and associate it
-  // with the TRITONBACKEND_Model.
-  LegionModelState* model_state;
-  RETURN_IF_ERROR(
-      LegionModelState::Create(model, name, version, runtime, &model_state));
-  RETURN_IF_ERROR(
-      TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
-
-  return nullptr;  // success
-}
-
-// Implementing TRITONBACKEND_ModelFinalize is optional unless state
-// is set using TRITONBACKEND_ModelSetState. The backend must free
-// this state and perform any other cleanup.
-TRITONSERVER_Error*
-TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
-{
-  // TODO: Currently assume this is called collectively
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      "TRITONBACKEND_ModelFinalize: deleting Legion model state");
-
-  void* vstate;
-  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
-  LegionModelState* model_state = reinterpret_cast<LegionModelState*>(vstate);
-  delete model_state;
-
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_VERBOSE,
-      "TRITONBACKEND_ModelFinalize: deleted Legion model state");
-
-  return nullptr;  // success
-}
-
-// Implementing TRITONBACKEND_ModelInstanceInitialize is optional. The
The -// backend should initialize any state that is required for a model -// instance. -TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) -{ - // TODO: Currently assume this is called collectively - // The instance can access the corresponding model as well... here - // we get the model and from that get the model's state. - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - LegionModelState* model_state = - reinterpret_cast(vmodelstate); - - // With each instance we create a LegionModelInstance object and - // associate it with the TRITONBACKEND_ModelInstance. - LegionModelInstance* instance_state; - RETURN_IF_ERROR( - LegionModelInstance::Create(instance, model_state, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( - instance, reinterpret_cast(instance_state))); - - return nullptr; // success -} - -// Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless -// state is set using TRITONBACKEND_ModelInstanceSetState. The backend -// must free this state and perform any other cleanup. -TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) -{ - // TODO: Currently assume this is called collectively - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - "TRITONBACKEND_ModelInstanceFinalize: deleting Legion instance state"); - - LegionModelInstance* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( - instance, reinterpret_cast(&instance_state))); - delete instance_state; - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - "TRITONBACKEND_ModelInstanceFinalize: deleted Legion instance state"); - - return nullptr; // success -} - -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceExecute( - TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, - const uint32_t request_count) -{ - // Triton will not call this function simultaneously for the same - // 'instance'. But since this backend could be used by multiple - // instances from multiple models the implementation needs to handle - // multiple calls to this function at the same time (with different - // 'instance' objects). Suggested practice for this is to use only - // function-local and model-instance-specific state (obtained from - // 'instance'), which is what we do here. - LegionModelInstance* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( - instance, reinterpret_cast(&instance_state))); - - // This backend specifies BLOCKING execution policy. That means that - // we should not return from this function until execution is - // complete. Triton will automatically release 'instance' on return - // from this function so that it is again available to be used for - // another call to TRITONBACKEND_ModelInstanceExecute. - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + instance_state->Model()->Name() + ", instance " + - instance_state->Name() + ", executing " + std::to_string(request_count) + - " requests") - .c_str()); - - // At this point we accept ownership of 'requests', which means that - // even if something goes wrong we must still return success from - // this function. If something does go wrong in processing a - // particular request then we send an error response just for the - // specific request. 
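Because the comment above ends by accepting ownership of `requests`, an execute entry point must not fail after that point; errors are turned into per-request responses instead. A skeleton of that contract, assuming the types named in this file:

```cpp
// Sketch only: mirrors the shape of the deleted
// TRITONBACKEND_ModelInstanceExecute.
TRITONSERVER_Error*
ExecuteSkeleton(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count)
{
  LegionModelInstance* instance_state;
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
      instance, reinterpret_cast<void**>(&instance_state)));
  // Ownership of 'requests' is accepted here: from now on every request
  // must be responded to and released, so the function reports success
  // even when individual requests fail.
  instance_state->ProcessRequests(requests, request_count);
  return nullptr;  // success
}
```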
- instance_state->ProcessRequests(requests, request_count); - - return nullptr; // success -} - -} // extern "C" - -}}} // namespace triton::backend::legion diff --git a/triton/src/common.h b/triton/src/common.h deleted file mode 100644 index 0a59d2385d..0000000000 --- a/triton/src/common.h +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_COMMON_H__ -#define __LEGION_TRITON_COMMON_H__ - -#include -#include "triton/core/tritonserver.h" -#include "types.h" - -namespace triton { namespace backend { namespace legion { - -DataType -ToDataType(const TRITONSERVER_DataType type) -{ - switch (type) { - case TRITONSERVER_TYPE_FP16: - return DT_HALF; - case TRITONSERVER_TYPE_FP32: - return DT_FLOAT; - case TRITONSERVER_TYPE_FP64: - return DT_DOUBLE; - case TRITONSERVER_TYPE_INT8: - return DT_INT8; - case TRITONSERVER_TYPE_INT16: - return DT_INT16; - case TRITONSERVER_TYPE_INT32: - return DT_INT32; - case TRITONSERVER_TYPE_INT64: - return DT_INT64; - case TRITONSERVER_TYPE_UINT8: - return DT_UINT8; - case TRITONSERVER_TYPE_UINT16: - return DT_UINT16; - case TRITONSERVER_TYPE_UINT32: - return DT_UINT32; - case TRITONSERVER_TYPE_UINT64: - return DT_UINT64; - case TRITONSERVER_TYPE_BOOL: - return DT_BOOLEAN; - case TRITONSERVER_TYPE_INVALID: - case TRITONSERVER_TYPE_BYTES: - default: - return DT_NONE; - } -} - -TRITONSERVER_DataType -ToTritonDataType(const DataType type) -{ - switch (type) { - case DT_HALF: - return TRITONSERVER_TYPE_FP16; - case DT_FLOAT: - return TRITONSERVER_TYPE_FP32; - case DT_DOUBLE: - return TRITONSERVER_TYPE_FP64; - case DT_INT8: - return TRITONSERVER_TYPE_INT8; - case DT_INT16: - return TRITONSERVER_TYPE_INT16; - case DT_INT32: - return TRITONSERVER_TYPE_INT32; - case DT_INT64: - return TRITONSERVER_TYPE_INT64; - case DT_UINT8: - return TRITONSERVER_TYPE_UINT8; - case DT_UINT16: - return TRITONSERVER_TYPE_UINT16; - case DT_UINT32: - return TRITONSERVER_TYPE_UINT32; - case DT_UINT64: - return TRITONSERVER_TYPE_UINT64; - case DT_BOOLEAN: - return TRITONSERVER_TYPE_BOOL; - case DT_NONE: - default: - return TRITONSERVER_TYPE_INVALID; - } -} - -std::string -DataTypeString(const DataType type) -{ - switch (type) { - case DT_HALF: - return "DT_HALF"; - case DT_FLOAT: - return "DT_FLOAT"; - case DT_DOUBLE: - return "DT_DOUBLE"; - case DT_INT8: - return "DT_INT8"; - case DT_INT16: - return "DT_INT16"; - case DT_INT32: - return "DT_INT32"; - case DT_INT64: - return "DT_INT64"; - case DT_UINT8: - return "DT_UINT8"; - case DT_UINT16: - return "DT_UINT16"; - case DT_UINT32: - return "DT_UINT32"; - case DT_UINT64: - return "DT_UINT64"; - case DT_BOOLEAN: - return "DT_BOOLEAN"; - case DT_NONE: - default: - return "DT_NONE"; - } -} - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_COMMON_H__ diff --git a/triton/src/config.h b/triton/src/config.h deleted file mode 100644 index e1c346a596..0000000000 --- a/triton/src/config.h +++ /dev/null @@ -1,27 +0,0 @@ -/* 
Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_CONFIG_H__ -#define __LEGION_TRITON_CONFIG_H__ - -// Configuration constants for upper bounds for some static properties - -// Maximum number of instances per model that we expect to see -#define MAX_NUM_INSTANCES 8 - -// Maximum number of local processors that we need to handle in this process -#define MAX_LOCAL_PROCS 16 - -#endif // __LEGION_TRITON_CONFIG_H__ diff --git a/triton/src/cudahelp.h b/triton/src/cudahelp.h deleted file mode 100644 index 8701a16fb2..0000000000 --- a/triton/src/cudahelp.h +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_CUDAHELP_H__ -#define __LEGION_TRITON_CUDAHELP_H__ - -#define __CUDA_NO_HALF_OPERATORS__ - -#include "cublas_v2.h" -#include "cuda_runtime.h" -#include "cudnn.h" -#include "types.h" - -#define FatalError(s) \ - do { \ - std::stringstream _message; \ - _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \ - std::cerr << _message.str() << "\nAborting...\n"; \ - abort(); \ - } while (0) - -#define CHECK_CUDNN(cmd) \ - do { \ - cudnnStatus_t status = (cmd); \ - if (status != CUDNN_STATUS_SUCCESS) { \ - std::stringstream _error; \ - _error << "CUDNN failure (" << status \ - << "): " << cudnnGetErrorString(status); \ - FatalError(_error.str()); \ - } \ - } while (0) - -#define CHECK_CURAND(cmd) \ - do { \ - curandStatus_t status = (cmd); \ - if (status != CURAND_STATUS_SUCCESS) { \ - std::stringstream _error; \ - _error << "CURAND failure: " << status; \ - FatalError(_error.str()); \ - } \ - } while (0) - -#define CHECK_CUDA(cmd) \ - do { \ - cudaError_t status = (cmd); \ - if (status != cudaSuccess) { \ - std::stringstream _error; \ - _error << "CUDA failure (" << status \ - << "): " << cudaGetErrorString(status); \ - FatalError(_error.str()); \ - } \ - } while (0) - -#define CHECK_CUBLAS(cmd) \ - do { \ - cublasStatus_t status = (cmd); \ - if (status != CUBLAS_STATUS_SUCCESS) { \ - std::stringstream _error; \ - _error << "CUBLAS failure: " << status; \ - FatalError(_error.str()); \ - } \ - } while (0) - -#define CHECK_NCCL(cmd) \ - do { \ - ncclResult_t status = (cmd); \ - if (status != ncclSuccess) { \ - std::stringstream _error; \ - _error << "NCCL failure (" << status \ - << "): " << ncclGetErrorString(status); \ - FatalError(_error.str()); \ - } \ - } while (0) - -#ifndef THREADS_PER_BLOCK -#define 
THREADS_PER_BLOCK 256 -#endif - -namespace triton { namespace backend { namespace legion { - -inline cudnnDataType_t -to_cudnn_datatype(DataType type) -{ - switch (type) { - case DT_HALF: - return CUDNN_DATA_HALF; - case DT_FLOAT: - return CUDNN_DATA_FLOAT; - case DT_DOUBLE: - return CUDNN_DATA_DOUBLE; - case DT_INT8: - return CUDNN_DATA_INT8; - case DT_INT32: - return CUDNN_DATA_INT32; - case DT_UINT8: - return CUDNN_DATA_UINT8; - default: - abort(); - } - return CUDNN_DATA_FLOAT; -} - -inline cudaDataType_t -to_cuda_datatype(DataType type) -{ - switch (type) { - case DT_HALF: - return CUDA_R_16F; - case DT_FLOAT: - return CUDA_R_32F; - case DT_DOUBLE: - return CUDA_R_64F; - case DT_INT8: - return CUDA_R_8I; - case DT_INT32: - return CUDA_R_32I; - case DT_UINT8: - return CUDA_R_8U; - case DT_UINT32: - return CUDA_R_32U; - default: - abort(); - } - return CUDA_R_32F; -} - -// To use cudnnOpTensor(), some data type combination is required, -// CUDNN_DATA_UINT8 will be returned if the combination is not supported -// https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnOpTensor -inline cudnnDataType_t -to_op_tensor_comp_type(DataType input0, DataType input1, DataType output) -{ - if (input0 != input1) { - return CUDNN_DATA_UINT8; - } - switch (output) { - case DataType::DT_FLOAT: { - switch (input0) { - case DataType::DT_FLOAT: - case DataType::DT_HALF: - case DataType::DT_INT8: - return CUDNN_DATA_FLOAT; - default: - return CUDNN_DATA_UINT8; - } - } - case DataType::DT_HALF: - case DataType::DT_INT8: - return ((input0 == output) || (input0 == DataType::DT_FLOAT)) - ? CUDNN_DATA_FLOAT - : CUDNN_DATA_UINT8; - case DataType::DT_DOUBLE: - return (input0 == DataType::DT_DOUBLE) ? CUDNN_DATA_DOUBLE - : CUDNN_DATA_UINT8; - case DataType::DT_INT16: - case DataType::DT_INT32: - case DataType::DT_INT64: - case DataType::DT_UINT8: - case DataType::DT_UINT16: - case DataType::DT_UINT32: - case DataType::DT_UINT64: - case DataType::DT_BOOLEAN: - case DataType::DT_NONE: - return CUDNN_DATA_UINT8; - } - return CUDNN_DATA_UINT8; -} - -inline cudnnStatus_t -cudnnSetTensorDescriptorFromDomain( - cudnnTensorDescriptor_t tensor, Legion::Domain domain, DataType type, - cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW) -{ - int dims[4]; - switch (domain.get_dim()) { - case 1: { - Legion::Rect<1> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - return cudnnSetTensor4dDescriptor( - tensor, format, to_cudnn_datatype(type), 1, 1, 1, dims[0]); - } - case 2: { - Legion::Rect<2> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - return cudnnSetTensor4dDescriptor( - tensor, format, to_cudnn_datatype(type), 1, 1, dims[0], dims[1]); - } - case 3: { - Legion::Rect<3> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - dims[2] = rect.hi[2] - rect.lo[2] + 1; - return cudnnSetTensor4dDescriptor( - tensor, format, to_cudnn_datatype(type), 1, dims[0], dims[1], - dims[2]); - } - case 4: { - Legion::Rect<4> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - dims[2] = rect.hi[2] - rect.lo[2] + 1; - dims[3] = rect.hi[3] - rect.lo[3] + 1; - return cudnnSetTensor4dDescriptor( - tensor, format, to_cudnn_datatype(type), dims[0], dims[1], dims[2], - dims[3]); - } - default: - abort(); - } - return CUDNN_STATUS_BAD_PARAM; -} - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_CUDAHELP_H__ diff --git a/triton/src/instance.cc b/triton/src/instance.cc deleted 
file mode 100644
index 3d12e2bf48..0000000000
--- a/triton/src/instance.cc
+++ /dev/null
@@ -1,811 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "instance.h"
-#include "strategy.h"
-#include "tensor.h"
-
-#define RESPOND_ALL_AND_RETURN_IF_ERROR(RET, RESPONSES, RESPONSES_COUNT, X) \
-  do {                                                                     \
-    TRITONSERVER_Error* raarie_err__ = (X);                                \
-    if (raarie_err__ != nullptr) {                                         \
-      SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__);     \
-      return RET;                                                          \
-    }                                                                      \
-  } while (false)
-
-#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X)                     \
-  do {                                                                  \
-    if ((RESPONSES)[IDX] != nullptr) {                                  \
-      TRITONSERVER_Error* err__ = (X);                                  \
-      if (err__ != nullptr) {                                           \
-        LOG_IF_ERROR(                                                   \
-            TRITONBACKEND_ResponseSend(                                 \
-                (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
-                err__),                                                 \
-            "failed to send error response");                           \
-        (RESPONSES)[IDX] = nullptr;                                     \
-        TRITONSERVER_ErrorDelete(err__);                                \
-      }                                                                 \
-    }                                                                   \
-  } while (false)
-
-using namespace Legion;
-
-namespace triton { namespace backend { namespace legion {
-
-TRITONSERVER_Error*
-LegionModelInstance::Create(
-    TRITONBACKEND_ModelInstance* triton_model_instance,
-    LegionModelState* model_state, LegionModelInstance** state)
-{
-  // Make a user event to denote when the context will be ready for this
-  // instance
-  Realm::UserEvent context_ready = Realm::UserEvent::create_user_event();
-  unsigned index = model_state->ReserveInstance();
-  try {
-    *state = new LegionModelInstance(
-        triton_model_instance, model_state, index, context_ready);
-  }
-  catch (const BackendModelInstanceException& ex) {
-    RETURN_ERROR_IF_TRUE(
-        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
-        std::string("unexpected nullptr in BackendModelInstanceException"));
-    RETURN_IF_ERROR(ex.err_);
-  }
-  model_state->RecordInstance(*state);
-  model_state->runtime_->RendezvousContextCreation(*state, context_ready);
-
-  return nullptr;  // success
-}
-
-LegionModelInstance::~LegionModelInstance()
-{
-  // Finish the implicit top-level task associated with this instance
-  Bind();
-  model_state_->finalize(this, index_, runtime_, context_, mapper_);
-  for (std::vector<LogicalRegion>::const_iterator it =
-           top_level_regions.begin();
-       it != top_level_regions.end(); it++)
-    runtime_->destroy_logical_region(context_, *it);
-  for (std::map<DataType, FieldSpace>::const_iterator it =
-           top_level_field_spaces.begin();
-       it != top_level_field_spaces.end(); it++)
-    runtime_->destroy_field_space(context_, it->second);
-  for (std::map<Domain, IndexSpace>::const_iterator it =
-           top_level_index_spaces.begin();
-       it != top_level_index_spaces.end(); it++)
-    runtime_->destroy_index_space(context_, it->second);
-  // FIXME: find a way to tell Legion to delete our mapper
-  runtime_->finish_implicit_task(context_);
-}
-
-LegionModelInstance::LegionModelInstance(
-    TRITONBACKEND_ModelInstance* triton_model_instance,
-    LegionModelState* model_state, unsigned index, Realm::Event ready)
-    : BackendModelInstance(model_state, triton_model_instance),
-      runtime_(model_state->runtime_->legion_), model_state_(model_state),
- index_(index), context_ready_(ready), mapper_(0) -{ - execution_barrier_ = Realm::Barrier::NO_BARRIER; -} - -void -LegionModelInstance::CreateContext( - Runtime* runtime, TaskID tid, unsigned rank, size_t total_ranks, - Realm::Event precondition, bool owner_instance) -{ - context_ = runtime->begin_implicit_task( - tid, 0 /*default mapper to bootstrap only*/, Processor::LOC_PROC /*CPU*/, - "Inference Task", true /*control replicable*/, 1 /*shard per process*/, - rank /*shard id*/); - // Create a unique mapper ID and mapper for this instance and then load the - // mapper - assert(mapper_ == 0); - // this will generate the same ID across the shards - mapper_ = runtime->generate_dynamic_mapper_id(); - assert(mapper_ != 0); - StrategyMapper* mapper = new StrategyMapper( - model_state_->GetStrategy(), runtime->get_mapper_runtime(), - Machine::get_machine()); - // Register this mapper with all the processors on the local node - runtime_->add_mapper(mapper_, mapper); - - model_state_->initialize(this, index_, runtime_, context_, mapper_); - // we can immediately unbind from this context - Unbind(); - // Check to see if we'll be the owner for managing execution - assert(!execution_barrier_.exists()); - if (owner_instance) { - execution_barrier_ = Realm::Barrier::create_barrier(total_ranks); - // The first generation is just our normal precondition - execution_barrier_.arrive(total_ranks, precondition); - execution_barrier_ = execution_barrier_.advance_barrier(); - } -} - -Realm::Barrier -LegionModelInstance::GetExecutionBarrier( - size_t total_ranks, Realm::Event& precondition, bool external, - bool need_lock) -{ - if (need_lock) { - if (external) { - AutoLock lock(lock_); - return GetExecutionBarrier(total_ranks, precondition, true, false); - } else { - AutoLock lock(lock_); - return GetExecutionBarrier(total_ranks, precondition, false, false); - } - } - // This better exist if we're here - assert(execution_barrier_.exists()); - precondition = execution_barrier_.get_previous_phase(); - const Realm::Barrier result = execution_barrier_; - execution_barrier_ = execution_barrier_.advance_barrier(); - if (!execution_barrier_.exists()) { - // Handle the case where we run out of barrier generations - execution_barrier_ = Realm::Barrier::create_barrier(total_ranks); - // Chain the barriers together in order - execution_barrier_.arrive(total_ranks, result); - execution_barrier_ = execution_barrier_.advance_barrier(); - } - return result; -} - -void -LegionModelInstance::RunModel( - const std::vector& inputs, - const std::vector& outputs, - std::vector& compute_input_end_ns, - std::vector& compute_output_start_ns, bool distributed) -{ - if (!distributed) { - LegionTritonRuntime* runtime = model_state_->runtime_; - runtime->DistributeRunModel( - model_state_->name, model_state_->version, index_, inputs, outputs, - compute_input_end_ns, compute_output_start_ns, runtime->rank_, this); - return; - } - AutoBind binding(this); - model_state_->forward( - this, index_, runtime_, context_, mapper_, inputs, outputs, - compute_input_end_ns, compute_output_start_ns); -} - -void -LegionModelInstance::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - - uint64_t request_start_ns = 0; - SET_TIMESTAMP(request_start_ns); - - const int max_batch_size = Model()->MaxBatchSize(); - - // For each request collect 
the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. - size_t total_batch_size = 0; - std::vector request_batch_sizes; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to Legion backend for '" + Name() + "'") - .c_str())); - return; - } - - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - request_batch_sizes.emplace_back(shape[0]); - } - if (err != nullptr) { - RequestsRespondWithError(requests, request_count, err); - return; - } - } else { - total_batch_size += 1; - request_batch_sizes.emplace_back(1); - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. - if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - Name() + "', max allowed is " + std::to_string(max_batch_size)) - .c_str())); - return; - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determine how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). 
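The loop above derives each request's contribution from dimension 0 of its first input. The accounting rule in isolation (a sketch; the helper name is illustrative):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Batching rule used above: when the model batches (max_batch_size > 0),
// dim 0 of any input carries the request's batch size; otherwise each
// request counts as 1.
void AccumulateBatch(
    const int64_t* shape, int max_batch_size, size_t& total_batch_size,
    std::vector<size_t>& request_batch_sizes)
{
  const size_t batch =
      (max_batch_size > 0) ? static_cast<size_t>(shape[0]) : 1;
  total_batch_size += batch;
  request_batch_sizes.push_back(batch);
}
```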
-  std::vector<TRITONBACKEND_Response*> responses;
-  responses.reserve(request_count);
-
-  for (size_t i = 0; i < request_count; i++) {
-    TRITONBACKEND_Response* response;
-    auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
-    if (err == nullptr) {
-      responses.emplace_back(response);
-    } else {
-      responses.emplace_back(nullptr);
-      LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response");
-      TRITONSERVER_ErrorDelete(err);
-    }
-  }
-
-  // Prepare I/O
-  std::vector<InputTensor> inputs;
-  if (!SetInputTensors(
-          total_batch_size, requests, request_count, &responses, inputs)) {
-    return;
-  }
-
-  std::vector<OutputTensor> outputs;
-  if (!SetOutputTensors(
-          total_batch_size, request_batch_sizes, requests, request_count,
-          &responses, outputs)) {
-    return;
-  }
-
-  std::vector<uint64_t> compute_input_end_ns(request_count);
-  std::vector<uint64_t> compute_output_start_ns(request_count);
-  RunModel(inputs, outputs, compute_input_end_ns, compute_output_start_ns);
-
-  uint64_t request_end_ns = request_start_ns;
-  SET_TIMESTAMP(request_end_ns);
-
-  // There are two types of statistics that we can report... the
-  // statistics for the entire batch of requests that we just executed
-  // and statistics for each individual request. Statistics for each
-  // individual request were reported above inside the loop as each
-  // request was processed (or for failed requests we report that
-  // failure below). Here we report statistics for the entire batch of
-  // requests.
-  LOG_IF_ERROR(
-      TRITONBACKEND_ModelInstanceReportBatchStatistics(
-          TritonModelInstance(), total_batch_size, request_start_ns,
-          compute_input_end_ns.front(), compute_output_start_ns.front(),
-          request_end_ns),
-      "failed reporting batch request statistics");
-
-  // We could have released each request as soon as we sent the
-  // corresponding response. But for clarity we just release them all
-  // here. Note that if something goes wrong when releasing a request
-  // all we can do is log it... there is no response left to use to
-  // report an error.
-  for (uint32_t r = 0; r < request_count; ++r) {
-    TRITONBACKEND_Request* request = requests[r];
-
-    // If we get to this point then there hasn't been any error and
-    // the response is complete and we can send it. This is the last
-    // (and only) response that we are sending for the request so we
-    // must mark it FINAL. If there is an error when sending all we
-    // can do is log it.
-    LOG_IF_ERROR(
-        TRITONBACKEND_ResponseSend(
-            responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
-            nullptr /* success */),
-        "failed sending response");
-
-    // Report statistics for the successful request. For an instance
-    // using the CPU we don't associate any device with the
-    // statistics, otherwise we associate the instance's device.
-    LOG_IF_ERROR(
-        TRITONBACKEND_ModelInstanceReportStatistics(
-            TritonModelInstance(), request, true /* success */,
-            request_start_ns, compute_input_end_ns[r],
-            compute_output_start_ns[r], request_end_ns),
-        "failed reporting request statistics");
-
-    // Before releasing, record failed requests as those where
-    // responses[r] is nullptr. The timestamps are ignored in this
-    // case.
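Each request above receives exactly one response, sent with the FINAL flag; once sending fails there is no remaining channel to the client, so the failure can only be logged. The send step, isolated as a sketch (assuming the backend-common `LOG_IF_ERROR` macro):

```cpp
// One-response-per-request contract: mark the response FINAL and fall back
// to logging on failure.
void SendFinalResponse(TRITONBACKEND_Response* response)
{
  LOG_IF_ERROR(
      TRITONBACKEND_ResponseSend(
          response, TRITONSERVER_RESPONSE_COMPLETE_FINAL,
          nullptr /* success */),
      "failed sending response");
}
```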
- if (responses[r] == nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, false /* success */, 0, 0, 0, 0), - "failed reporting request statistics"); - } - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } -} - -bool -LegionModelInstance::SetInputTensors( - const size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - std::vector& inputs) -{ - // [FIXME] more checking in terms of expected byte size and actual byte size - const int max_batch_size = Model()->MaxBatchSize(); - size_t padding_batch_size = - (max_batch_size == 0) ? 0 : max_batch_size - total_batch_size; - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. - uint32_t input_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - inputs.resize(input_count); - LegionTritonRuntime* runtime = model_state_->runtime_; - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - InputTensor& tensor = inputs[input_idx]; - TRITONBACKEND_Input* input; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, - &input_dims_count, nullptr, nullptr)); - - tensor.name_ = input_name; - std::vector batchn_shape( - input_shape, input_shape + input_dims_count); - - tensor.strides_ = std::vector( - input_dims_count, GetByteSize(input_datatype, {1})); - if (input_dims_count > 1) { - for (size_t i = input_dims_count - 1; i > 0; --i) { - tensor.strides_[i - 1] = tensor.strides_[i] * batchn_shape[i]; - } - } - - for (size_t request_idx = 0; request_idx < request_count; ++request_idx) { - TRITONBACKEND_Input* input; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_RequestInputByIndex( - requests[request_idx], input_idx, &input)); - - uint64_t total_buffer_byte_size; - uint32_t buffer_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_InputProperties( - input, nullptr, nullptr, nullptr, nullptr, - &total_buffer_byte_size, &buffer_count)); - - // Check if the input buffers need to be preprocessed into - // contiguous buffer that satisfies the constraints - bool need_preprocess = false; - std::vector buffers; - std::vector buffer_memories; - std::vector> buffer_locations; - // FIXME need to understand how a buffer satisfies the constraints, - // currently the request input must be in one contiguous buffer, and - // we shouldn't need to concatenate buffer for requests as splitting along - // batch dimension should be okay. 
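`SetInputTensors` above fills `strides_` right to left, seeding every entry with the element size and multiplying in the trailing dimension sizes. The same row-major computation stated on its own (a sketch; for example, shape {2, 3, 4} with 4-byte elements yields strides {48, 16, 4}):

```cpp
#include <cstdint>
#include <vector>

// Row-major strides: stride[last] = element size; each earlier stride is
// the next stride times that dimension's extent.
std::vector<size_t> RowMajorStrides(
    const std::vector<int64_t>& shape, size_t element_size)
{
  std::vector<size_t> strides(shape.size(), element_size);
  if (shape.size() > 1) {
    for (size_t i = shape.size() - 1; i > 0; --i) {
      strides[i - 1] = strides[i] * static_cast<size_t>(shape[i]);
    }
  }
  return strides;
}
```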
- need_preprocess = (buffer_count > 1); - if (!need_preprocess) { - const void* buffer; - uint64_t buffer_byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_InputBuffer( - input, 0, &buffer, &buffer_byte_size, &memory_type, - &memory_type_id)); - buffers.emplace_back(buffer); - buffer_locations.emplace_back(memory_type, memory_type_id); - buffer_memories.emplace_back( - runtime->FindMemory(memory_type, memory_type_id)); - } - // for (size_t buffer_idx = 0; buffer_idx < buffer_count; ++buffer_count) - // { - // const void* buffer; - // uint64_t buffer_byte_size; - // TRITONSERVER_MemoryType memory_type; - // int64_t memory_type_id; - // RESPOND_ALL_AND_RETURN_IF_ERROR( - // false, responses, request_count, - // TRITONBACKEND_InputBuffer( - // input, buffer_idx, &buffer, &buffer_byte_size, &memory_type, - // &memory_type_id)); - // // Check if the buffer is good - // for (auto it = tensor.strides_.cbegin(); - // it != tensor.strides_.cend(); ++it) { - // if (*it <= buffer_byte_size) { - // need_preprocess = ((buffer_byte_size % *it) != 0); - // break; - // } - // } - // if (need_preprocess) { - // break; - // } else { - // buffers.emplace_back(buffer); - // buffer_locations.emplace_back(memory_type, memory_type_id); - // } - // } - if (need_preprocess) { - // FIXME using CPU for now, can be smart based on what kind of input - // buffer that the model prefers - BackendMemory* backend_memory; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - BackendMemory::Create( - Model()->TritonMemoryManager(), - BackendMemory::AllocationType::CPU, 0, total_buffer_byte_size, - &backend_memory)); - tensor.allocated_memory_.emplace_back(backend_memory); - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - ReadInputTensor( - requests[request_idx], input_name, backend_memory->MemoryPtr(), - &total_buffer_byte_size)); - tensor.buffers_.emplace_back(backend_memory->MemoryPtr()); - tensor.buffer_locations_.emplace_back( - backend_memory->MemoryType(), backend_memory->MemoryTypeId()); - tensor.buffer_memories_.emplace_back(runtime->FindMemory( - backend_memory->MemoryType(), backend_memory->MemoryTypeId())); - } else { - std::copy( - buffers.begin(), buffers.end(), - std::back_inserter(tensor.buffers_)); - std::copy( - buffer_locations.begin(), buffer_locations.end(), - std::back_inserter(tensor.buffer_locations_)); - std::copy( - buffer_memories.begin(), buffer_memories.end(), - std::back_inserter(tensor.buffer_memories_)); - } - } - - if (padding_batch_size != 0) { - size_t byte_size = tensor.strides_[0] * padding_batch_size; - BackendMemory* backend_memory; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - BackendMemory::Create( - Model()->TritonMemoryManager(), - BackendMemory::AllocationType::CPU, 0, byte_size, - &backend_memory)); - tensor.allocated_memory_.emplace_back(backend_memory); - // set the value of the padding to zeros - memset(backend_memory->MemoryPtr(), 0, byte_size); - tensor.buffers_.emplace_back(backend_memory->MemoryPtr()); - tensor.buffer_locations_.emplace_back( - backend_memory->MemoryType(), backend_memory->MemoryTypeId()); - } - } - return true; -} - -bool -LegionModelInstance::SetOutputTensors( - const size_t total_batch_size, - const std::vector& request_batch_sizes, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, - std::vector& outputs) -{ - const int max_batch_size = 
Model()->MaxBatchSize(); - size_t padding_batch_size = - (max_batch_size == 0) ? 0 : max_batch_size - total_batch_size; - - const auto& output_infos = model_state_->OutputInfos(); - outputs.reserve(output_infos.size()); - LegionTritonRuntime* runtime = model_state_->runtime_; - for (const auto& output_info : output_infos) { - outputs.emplace_back(); - OutputTensor& tensor = outputs.back(); - tensor.name_ = std::get<0>(output_info); - const auto& triton_dtype = std::get<1>(output_info); - // Make a copy of it as the batch dimension will be updated to - // match individual request batch size. - std::vector batchn_shape = std::get<2>(output_info); - - tensor.strides_ = std::vector( - batchn_shape.size(), GetByteSize(triton_dtype, {1})); - if (batchn_shape.size() > 1) { - for (size_t i = (batchn_shape.size() - 1); i > 0; --i) { - tensor.strides_[i - 1] = tensor.strides_[i] * batchn_shape[i]; - } - } - size_t batch1_byte_size = GetByteSize(triton_dtype, batchn_shape); - if (max_batch_size != 0) { - batch1_byte_size /= batchn_shape[0]; - } - // Prepare the output buffer for each response, if the output is not - // requested, backend managed buffer will be used - for (size_t request_idx = 0; request_idx < request_count; ++request_idx) { - uint32_t requested_output_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_RequestOutputCount( - requests[request_idx], &requested_output_count)); - bool found = false; - for (size_t output_idx = 0; output_idx < requested_output_count; - ++output_idx) { - const char* output_name; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_RequestOutputName( - requests[request_idx], output_idx, &output_name)); - if (tensor.name_ == output_name) { - found = true; - break; - } - } - if (found) { - if (max_batch_size != 0) { - batchn_shape[0] = request_batch_sizes[request_idx]; - } - TRITONBACKEND_Output* response_output; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_ResponseOutput( - (*responses)[request_idx], &response_output, - tensor.name_.c_str(), triton_dtype, batchn_shape.data(), - batchn_shape.size())); - void* buffer; - // FIXME using CPU for now, can be smart based on what kind of output - // buffer that the model produce - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - TRITONBACKEND_OutputBuffer( - response_output, &buffer, - batch1_byte_size * request_batch_sizes[request_idx], - &memory_type, &memory_type_id)); - tensor.buffers_.emplace_back(buffer); - tensor.buffer_locations_.emplace_back(memory_type, memory_type_id); - tensor.buffer_memories_.emplace_back( - runtime->FindMemory(memory_type, memory_type_id)); - } else { - BackendMemory* backend_memory; - RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - BackendMemory::Create( - Model()->TritonMemoryManager(), - BackendMemory::AllocationType::CPU, 0, - batch1_byte_size * request_batch_sizes[request_idx], - &backend_memory)); - tensor.allocated_memory_.emplace_back(backend_memory); - tensor.buffers_.emplace_back(backend_memory->MemoryPtr()); - tensor.buffer_locations_.emplace_back( - backend_memory->MemoryType(), backend_memory->MemoryTypeId()); - tensor.buffer_memories_.emplace_back(runtime->FindMemory( - backend_memory->MemoryType(), backend_memory->MemoryTypeId())); - } - } - if (padding_batch_size != 0) { - BackendMemory* backend_memory; - 
RESPOND_ALL_AND_RETURN_IF_ERROR( - false, responses, request_count, - BackendMemory::Create( - Model()->TritonMemoryManager(), - BackendMemory::AllocationType::CPU, 0, - batch1_byte_size * padding_batch_size, &backend_memory)); - tensor.allocated_memory_.emplace_back(backend_memory); - tensor.buffers_.emplace_back(backend_memory->MemoryPtr()); - tensor.buffer_locations_.emplace_back( - backend_memory->MemoryType(), backend_memory->MemoryTypeId()); - tensor.buffer_memories_.emplace_back(runtime->FindMemory( - backend_memory->MemoryType(), backend_memory->MemoryTypeId())); - } - } - return true; -} - -IndexSpace -LegionModelInstance::find_or_create_index_space(const Domain& domain) -{ - std::map::const_iterator finder = - top_level_index_spaces.find(domain); - if (finder != top_level_index_spaces.end()) - return finder->second; - IndexSpace result = runtime_->create_index_space(context_, domain); - top_level_index_spaces[domain] = result; - return result; -} - -IndexPartition -LegionModelInstance::find_or_create_partition( - IndexSpace top_level_space, IndexSpace color_space, - const DomainTransform& part_transform, const Domain& part_extent, - PartitionKind kind) -{ - std::map>::const_iterator finder = - top_level_partitions.find(top_level_space); - if (finder != top_level_partitions.end()) { - switch (part_extent.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - Transform transform = part_transform; \ - Rect extent = part_extent; \ - for (std::vector::const_iterator it = finder->second.begin(); \ - it != finder->second.end(); it++) { \ - if (color_space != it->color_space) \ - continue; \ - Rect prev_extent = it->extent; \ - if (extent != prev_extent) \ - continue; \ - Transform prev_transform = it->transform; \ - bool isomorphic = true; \ - for (int d = 0; d < DIM; d++) { \ - if (transform[d] == prev_transform[d]) \ - continue; \ - isomorphic = false; \ - break; \ - } \ - if (!isomorphic) \ - continue; \ - return it->partition; \ - } \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } - } - // If we get here then we need to make it - IndexPartition result = runtime_->create_partition_by_restriction( - context_, top_level_space, color_space, part_transform, part_extent, - kind); - // Save it for later - top_level_partitions[top_level_space].push_back( - Partition(color_space, result, part_transform, part_extent)); - return result; -} - -FieldSpace -LegionModelInstance::find_or_create_field_space(DataType data_type) -{ - std::map::const_iterator finder = - top_level_field_spaces.find(data_type); - if (finder != top_level_field_spaces.end()) - return finder->second; - // make a new field space - FieldSpace result = runtime_->create_field_space(context_); - top_level_field_spaces[data_type] = result; - // Allocate a field of the right size in the field space - FieldAllocator allocator = runtime_->create_field_allocator(context_, result); - allocator.allocate_field(sizeof_datatype(data_type), FID_DATA); - return result; -} - -LogicalRegion -LegionModelInstance::create_tensor_region(Tensor* tensor) -{ - assert(!tensor->region[index_].exists()); - DomainPoint lo, hi; - lo.dim = tensor->bounds.size(); - hi.dim = tensor->bounds.size(); - for (unsigned d = 0; d < tensor->bounds.size(); d++) { - lo[d] = 0; - assert(tensor->bounds[d] > 0); - hi[d] = tensor->bounds[d] - 1; // legion domains are inclusive - } - Domain bounds(lo, hi); - IndexSpace is = find_or_create_index_space(bounds); - FieldSpace fs = find_or_create_field_space(tensor->type); - LogicalRegion lr = 
runtime_->create_logical_region(context_, is, fs); - // Save the handle in the tensor - tensor->region[index_] = lr; - // Remember the name for when we need to delete it - top_level_regions.push_back(lr); - return lr; -} - -LogicalPartition -LegionModelInstance::find_or_create_tiled_partition( - Tensor* tensor, const LayerStrategy* strategy) -{ - assert(tensor->region[index_].exists()); - if (tensor->partition[index_].exists()) - return tensor->partition[index_]; - assert(size_t(strategy->nDims) == tensor->bounds.size()); - Domain color_domain = strategy->get_launch_domain(); - IndexSpace color_space = find_or_create_index_space(color_domain); - Domain part_extent; - DomainTransform part_transform; - switch (tensor->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - Point ext_hi; \ - Rect color_rect = color_domain; \ - for (int d = 0; d < DIM; d++) { \ - size_t parts = color_rect.hi[d] - color_rect.lo[d] + 1; \ - ext_hi[d] = (tensor->bounds[d] + parts - 1) / parts - 1; \ - } \ - Rect extent(Point::ZEROES(), ext_hi); \ - Transform transform; \ - for (int i = 0; i < DIM; i++) \ - for (int j = 0; j < DIM; j++) \ - if (i == j) \ - transform[i][j] = extent.hi[i] - extent.lo[i] + 1; \ - else \ - transform[i][j] = 0; \ - part_extent = extent; \ - part_transform = transform; \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } - // Find or compute the index partition - LogicalRegion region = tensor->region[index_]; - IndexPartition partition = find_or_create_partition( - region.get_index_space(), color_space, part_transform, part_extent, - LEGION_DISJOINT_COMPLETE_KIND); - LogicalPartition result = runtime_->get_logical_partition_by_tree( - context_, partition, region.get_field_space(), region.get_tree_id()); - tensor->partition[index_] = result; - return result; -} - -}}} // namespace triton::backend::legion diff --git a/triton/src/instance.h b/triton/src/instance.h deleted file mode 100644 index 116c267260..0000000000 --- a/triton/src/instance.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __LEGION_TRITON_INSTANCE_H__ -#define __LEGION_TRITON_INSTANCE_H__ - -#include "legion.h" -#include "model.h" -#include "runtime.h" -#include "strategy.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model_instance.h" - -namespace triton { namespace backend { namespace legion { - -struct InputTensor { - std::string name_; - std::vector buffers_; - std::vector> buffer_locations_; - std::vector buffer_memories_; - std::vector strides_; - // A placeholder for the memory acquired to hold the preprocessed input buffer - std::vector> allocated_memory_; -}; - -struct OutputTensor { - std::string name_; - std::vector buffers_; - std::vector> buffer_locations_; - std::vector buffer_memories_; - std::vector strides_; - // A placeholder for the memory acquired to hold the part of the output - // that is not requested - std::vector> allocated_memory_; -}; - -// -// LegionModelInstance -// -// Each instantiation of this class represents a backend instance -// for running inference requests on a set of resources. It will -// have an associated Legion implicit task along with a trace for -// replaying inference jobs. -// -// -class LegionModelInstance : public BackendModelInstance { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_ModelInstance* triton_model_instance, - LegionModelState* model_state, LegionModelInstance** state); - - ~LegionModelInstance(); - - void CreateContext( - Legion::Runtime* runtime, Legion::TaskID tid, unsigned rank, - size_t total_ranks, Realm::Event precondition, bool owner_instance); - - Realm::Barrier GetExecutionBarrier( - size_t ranks, Realm::Event& precondition, bool external, - bool need_lock = true); - - // Execute... - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - void RunModel( - const std::vector& inputs, - const std::vector& outputs, - std::vector& compute_input_end, - std::vector& compute_output_start, bool distributed = false); - - private: - inline void Bind() const - { - runtime_->bind_implicit_task_to_external_thread(context_); - } - inline void Unbind() const - { - runtime_->unbind_implicit_task_from_external_thread(context_); - } - - // Small helper class to make sure we always unbind even under errors - class AutoBind { - public: - AutoBind(LegionModelInstance* state) : instance_state(state) - { - state->Bind(); - } - ~AutoBind() { instance_state->Unbind(); } - - private: - LegionModelInstance* const instance_state; - }; - - // Set the input tensors for running the model, in case of error, responses - // will be returned with error and the function will return false. - // Returns true on success. 
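The `AutoBind` helper above is a bind/unbind RAII guard: binding happens in the constructor and unbinding in the destructor, so an early return or exception cannot leave the implicit task bound. A generic form of the same idea (a sketch, not part of the original file):

```cpp
#include <functional>
#include <utility>

// Run one action on entry and its inverse on scope exit, so the exit
// action always pairs with the entry action (the AutoBind pattern).
class ScopeGuard {
 public:
  ScopeGuard(std::function<void()> enter, std::function<void()> exit)
      : exit_(std::move(exit))
  {
    enter();
  }
  ~ScopeGuard() { exit_(); }
  ScopeGuard(const ScopeGuard&) = delete;
  ScopeGuard& operator=(const ScopeGuard&) = delete;

 private:
  std::function<void()> exit_;
};
```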
- bool SetInputTensors( - const size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - std::vector& inputs); - - bool SetOutputTensors( - const size_t total_batch_size, - const std::vector& request_batch_sizes, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, - std::vector& outputs); - - LegionModelInstance( - TRITONBACKEND_ModelInstance* triton_model_instance, - LegionModelState* model_state, unsigned index, Realm::Event ready); - - public: - // methods for support of operators - // should only be invoked inside the implicit top-level legion tasks - Legion::IndexSpace find_or_create_index_space(const Legion::Domain& domain); - Legion::IndexPartition find_or_create_partition( - Legion::IndexSpace top_level_space, Legion::IndexSpace color_space, - const Legion::DomainTransform& transform, const Legion::Domain& extent, - Legion::PartitionKind kind); - Legion::FieldSpace find_or_create_field_space(DataType date_type); - Legion::LogicalRegion create_tensor_region(Tensor* tensor); - Legion::LogicalPartition find_or_create_tiled_partition( - Tensor* tensor, const LayerStrategy* strategy); - - public: - Legion::Runtime* const runtime_; - LegionModelState* const model_state_; - const unsigned index_; - const Realm::Event context_ready_; - - private: - Legion::Context context_; - Legion::MapperID mapper_; - - private: - Realm::FastReservation lock_; - Realm::Barrier execution_barrier_; - - private: - std::map top_level_index_spaces; - struct Partition { - public: - Partition(void) {} - Partition( - Legion::IndexSpace cs, Legion::IndexPartition p, - const Legion::DomainTransform& t, const Legion::Domain& e) - : color_space(cs), partition(p), transform(t), extent(e) - { - } - - public: - Legion::IndexSpace color_space; - Legion::IndexPartition partition; - Legion::DomainTransform transform; - Legion::Domain extent; - }; - std::map> top_level_partitions; - std::map top_level_field_spaces; - std::vector top_level_regions; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_INSTANCE_H__ diff --git a/triton/src/libtriton_legion.ldscript b/triton/src/libtriton_legion.ldscript deleted file mode 100644 index 0b1c2b695b..0000000000 --- a/triton/src/libtriton_legion.ldscript +++ /dev/null @@ -1,23 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#------------------------------------------------------------------------------# -{ - global: - TRITONBACKEND_*; - extern "C++" { - triton::backend::legion::*; - }; - local: *; -}; diff --git a/triton/src/model.cc b/triton/src/model.cc deleted file mode 100644 index a61b207bdd..0000000000 --- a/triton/src/model.cc +++ /dev/null @@ -1,555 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "model.h" -#include "common.h" -#include "instance.h" -#include "onnx_parser.h" -#include "operator.h" -#include "tensor.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -TRITONSERVER_Error* -LegionModelState::Create( - TRITONBACKEND_Model* triton_model, const std::string& name, - uint64_t version, LegionTritonRuntime* runtime, LegionModelState** state) -{ - std::unique_ptr lstate; - try { - lstate.reset(new LegionModelState(triton_model, runtime, name, version)); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Load the model first to obtain the ground truth for processing model config - RETURN_IF_ERROR(lstate->LoadModel()); - - // Auto-complete the configuration if requested... 
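`LegionModelState::Create` above follows the factory shape used throughout these files: construct into a `std::unique_ptr`, validate step by step so any early error return cleans up, and release ownership only after every step has passed. Sketched generically below; the helper is hypothetical and assumes the Triton server headers are in scope for `TRITONSERVER_Error`.

```cpp
#include <memory>
#include <utility>

template <typename T, typename... Args>
TRITONSERVER_Error*
CreateState(T** out, Args&&... args)
{
  std::unique_ptr<T> state(new T(std::forward<Args>(args)...));
  // ... per-step validation would RETURN_IF_ERROR here, with 'state'
  // destroyed automatically on any early return ...
  *out = state.release();  // every step passed: transfer ownership
  return nullptr;          // success
}
```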
- bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - if (auto_complete_config) { - RETURN_IF_ERROR(lstate->AutoCompleteConfig()); - - triton::common::TritonJson::WriteBuffer json_buffer; - lstate->ModelConfig().Write(&json_buffer); - - TRITONSERVER_Message* message; - RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( - &message, json_buffer.Base(), json_buffer.Size())); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( - triton_model, 1 /* config_version */, message)); - } - RETURN_IF_ERROR(lstate->ValidateModelConfig()); - *state = lstate.release(); - runtime->RecordModel(*state); - return nullptr; // success -} - -LegionModelState::~LegionModelState(void) -{ - FreeLayers(); - for (auto& input : inputs_) delete input.second; - if (strategy_) - delete strategy_; - runtime_->RemoveModel(this); -} - -TRITONSERVER_Error* -LegionModelState::LoadModel() -{ - // TODO: load files based on the default / cc file name that may be set - // in model config - auto model_path = JoinPath({RepositoryPath(), std::to_string(Version())}); - assert(strategy_ == nullptr); - strategy_ = PartitionStrategy::LoadStrategy( - JoinPath({model_path, "model.strategy"}), this); - - // load the ONNX model description as a list of layers - // with tensor dependences between then and put them in layers_ - RETURN_IF_ERROR(OnnxParser::LoadModel( - [this]( - Realm::Processor::Kind kind) -> const std::vector& { - return runtime_->FindLocalProcessors(kind); - }, - this, strategy_, JoinPath({model_path, "model.onnx"}), &inputs_, - &outputs_, &layers_)); - RETURN_IF_ERROR(SetOutputInfos()); - - // Should have the same number of layers in both cases - assert(strategy_->layers.size() == layers_.size()); - - // Perform the layer fusion optimization based on the partitioning strategy - FuseLayers(); - - // Load each of the layers across the target processors - LoadLayers(); - - return nullptr; -} - -unsigned -LegionModelState::ReserveInstance(void) -{ - AutoLock lock(lock_); - unsigned result = instances_.size(); - instances_.resize(result + 1, nullptr); - return result; -} - -void -LegionModelState::RecordInstance(LegionModelInstance* instance) -{ - assert(instance->model_state_ == this); - AutoLock lock(lock_, false /*exclusive*/); - assert(instance->index_ < instances_.size()); - assert(instances_[instance->index_] == nullptr); - instances_[instance->index_] = instance; -} - -void -LegionModelState::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - // First create logical regions for all the input tensors - for (auto& input : inputs_) instance->create_tensor_region(input.second); - - for (auto layer : layers_) - layer->initialize(instance, instance_index, runtime, ctx, mapper); -} - -void -LegionModelState::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper, - const std::vector& inputs, - const std::vector& outputs, - std::vector& compute_input_end_ns, - std::vector& compute_output_start_ns) -{ - assert(inputs.size() == inputs_.size()); - assert(outputs.size() == outputs_.size()); - // Attach the external memory allocations to the logical regions for the - // tensors - const std::vector fields(1, FID_DATA); - std::vector input_regions(inputs.size()); - for (unsigned idx = 0; idx < inputs.size(); idx++) { - const InputTensor& input = inputs[idx]; - assert(input.buffers_.size() == 1); - 
assert(input.buffer_locations_.size() == 1); - assert(input.buffer_memories_.size() == 1); - assert(input.strides_.size() == inputs_[idx].second->bounds.size()); - LogicalRegion region = inputs_[idx].second->region[instance_index]; - AttachLauncher launcher( - LEGION_EXTERNAL_INSTANCE, region, region, false /*restricted*/, - false /*mapped*/); - launcher.attach_array_soa( - const_cast(input.buffers_[0]), false /*not column major*/, - fields, input.buffer_memories_[0]); - input_regions[idx] = runtime->attach_external_resource(ctx, launcher); - } - std::vector output_regions(outputs.size()); - for (unsigned idx = 0; idx < outputs.size(); idx++) { - const OutputTensor& output = outputs[idx]; - assert(output.buffers_.size() == 1); - assert(output.buffer_locations_.size() == 1); - assert(output.buffer_memories_.size() == 1); - assert(output.strides_.size() == outputs_[idx].second->bounds.size()); - LogicalRegion region = outputs_[idx].second->region[instance_index]; - AttachLauncher launcher( - LEGION_EXTERNAL_INSTANCE, region, region, false /*restricted*/, - false /*mapped*/); - launcher.attach_array_soa( - output.buffers_[0], false /*not column major*/, fields, - output.buffer_memories_[0]); - output_regions[idx] = runtime->attach_external_resource(ctx, launcher); - } - // Execution fence for timing operation - runtime->issue_execution_fence(ctx); - TimingLauncher timing_launcher(LEGION_MEASURE_NANO_SECONDS); - Future start = runtime->issue_timing_measurement(ctx, timing_launcher); - - // We can trace the execution of this model since it should be the same - runtime->begin_trace(ctx, 0 /*only ever have one trace*/); - for (auto layer : layers_) - layer->forward(instance, instance_index, runtime, ctx, mapper); - runtime->end_trace(ctx, 0 /*only ever have one trace*/); - - // Execution fence for timing operation - runtime->issue_execution_fence(ctx); - Future stop = runtime->issue_timing_measurement(ctx, timing_launcher); - // Detach the external memory allocations - for (unsigned idx = 0; idx < input_regions.size(); idx++) - runtime->detach_external_resource(ctx, input_regions[idx], false /*flush*/); - for (unsigned idx = 0; idx < output_regions.size(); idx++) - runtime->detach_external_resource(ctx, output_regions[idx], true /*flush*/); - - const uint64_t start_time = start.get_result(); - for (unsigned idx = 0; idx < compute_input_end_ns.size(); idx++) - compute_input_end_ns[idx] = start_time; - - const uint64_t stop_time = stop.get_result(); - for (unsigned idx = 0; idx < compute_output_start_ns.size(); idx++) - compute_output_start_ns[idx] = stop_time; - - // Wait for everything to be done before we return - Future done = runtime->issue_execution_fence(ctx); - done.wait(); -} - -void -LegionModelState::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - for (auto layer : layers_) - layer->finalize(instance, instance_index, runtime, ctx, mapper); -} - -LegionModelInstance* -LegionModelState::FindInstance( - unsigned instance_index, bool external, bool need_lock) -{ - if (need_lock) { - if (external) { - AutoLock lock(lock_, false /*exclusive*/); - return FindInstance(instance_index, true, false); - } else { - AutoLock lock(lock_, false /*exclusive*/); - return FindInstance(instance_index, false, false); - } - } - assert(instance_index < instances_.size()); - return instances_[instance_index]; -} - -const PartitionStrategy* -LegionModelState::GetStrategy(void) const -{ - assert(strategy_ != nullptr); - return 
-
-TRITONSERVER_Error*
-LegionModelState::AutoCompleteConfig()
-{
-  // FIXME: Check with the FFModel
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-LegionModelState::ValidateModelConfig()
-{
-  // Constraints that apply to models in general
-  {
-    triton::common::TritonJson::Value igs;
-    RETURN_IF_ERROR(ModelConfig().MemberAsArray("instance_group", &igs));
-    for (size_t i = 0; i < igs.ArraySize(); i++) {
-      triton::common::TritonJson::Value ig;
-      RETURN_IF_ERROR(igs.IndexAsObject(i, &ig));
-      std::string kind_str;
-      RETURN_IF_ERROR(ig.MemberAsString("kind", &kind_str));
-      if (kind_str != "KIND_MODEL") {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "unexpected instance group kind '" + kind_str +
-                 "' for model '" + Name() +
-                 "', expecting 'KIND_MODEL' to use model-specified device "
-                 "placement")
-                 .c_str()));
-      }
-    }
-
-    // [issue #4] batching is currently not supported
-    if (max_batch_size_ != 0) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("'max_batch_size' must be 0 in model configuration as "
-                       "batching is not currently supported")
-               .c_str()));
-    }
-
-    // FIXME: add checks for other model config fields that are not yet
-    // supported
-  }
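Distilled, the two general constraints are: every instance group must be KIND_MODEL (device placement comes from the partition strategy, not from Triton), and max_batch_size must be 0 (how a Triton config declares a non-batching model). A self-contained sketch of just that check, with a stand-in error type rather than the backend's TRITONSERVER_Error API:

#include <optional>
#include <string>

struct ConfigError { std::string message; };

std::optional<ConfigError> CheckGeneralConstraints(
    const std::string& instance_group_kind, int max_batch_size,
    const std::string& model_name)
{
  // Placement is dictated by the strategy file, so KIND_GPU/KIND_CPU
  // instance groups are rejected.
  if (instance_group_kind != "KIND_MODEL") {
    return ConfigError{"unexpected instance group kind '" +
                       instance_group_kind + "' for model '" + model_name +
                       "', expecting 'KIND_MODEL'"};
  }
  // Batching is not supported yet.
  if (max_batch_size != 0) {
    return ConfigError{"'max_batch_size' must be 0 for model '" + model_name +
                       "'"};
  }
  return std::nullopt;  // config acceptable
}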
-
-  {
-    // Build a map from name to tensors of the model for easy lookup
-    std::map<std::string, Tensor*> tensors;
-    for (const auto& io : inputs_) {
-      tensors.emplace(io.first, io.second);
-    }
-
-    triton::common::TritonJson::Value ios;
-    RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &ios));
-
-    if (ios.ArraySize() != tensors.size()) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string(
-               "configuration for model '" + Name() + "' specifies " +
-               std::to_string(ios.ArraySize()) + " inputs, the model has " +
-               std::to_string(tensors.size()))
-               .c_str()));
-    }
-
-    for (size_t i = 0; i < ios.ArraySize(); i++) {
-      triton::common::TritonJson::Value io;
-      RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
-      std::string io_name;
-      RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
-
-      // Check datatypes
-      std::string io_dtype;
-      RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype));
-      RETURN_ERROR_IF_TRUE(
-          (io_dtype == "TYPE_STRING"), TRITONSERVER_ERROR_INVALID_ARG,
-          std::string("unsupported datatype '") + io_dtype + "' for tensor '" +
-              io_name + "' for model '" + Name() + "'");
-      // If a reshape is provided for the input then use that when
-      // validating that the model matches what is expected.
-      std::vector<int64_t> dims;
-      triton::common::TritonJson::Value reshape;
-      if (io.Find("reshape", &reshape)) {
-        RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims));
-      } else {
-        RETURN_IF_ERROR(ParseShape(io, "dims", &dims));
-      }
-      for (const auto dim : dims) {
-        RETURN_ERROR_IF_TRUE(
-            (dim == WILDCARD_DIM), TRITONSERVER_ERROR_INVALID_ARG,
-            std::string(
-                "dynamic tensor is not supported for model '" + Name() + "'"));
-      }
-
-      // Check the properties against the corresponding tensor
-      auto it = tensors.find(io_name);
-      if (it == tensors.end()) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "configuration for model '" + Name() + "' specifies tensor '" +
-                 io_name + "' which is not found in the model")
-                 .c_str()));
-      }
-      const auto& tensor = it->second;
-      if (ToDataType(ModelConfigDataTypeToTritonServerDataType(io_dtype)) !=
-          tensor->type) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "configuration for model '" + Name() + "' specifies tensor '" +
-                 io_name + "' with type '" + io_dtype +
-                 "', the tensor in the model has type '" +
-                 DataTypeString(tensor->type) + "'")
-                 .c_str()));
-      } else if (tensor->type == DT_NONE) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "tensor '" + io_name + "' in the model '" + Name() +
-                 "' has unknown type")
-                 .c_str()));
-      }
-      if (max_batch_size_ != 0) {
-        dims.insert(dims.begin(), max_batch_size_);
-      }
-      // Put the tensor's bounds in int64_t to utilize backend common utilities
-      std::vector<int64_t> tensor_bounds;
-      for (const auto bound : tensor->bounds) {
-        tensor_bounds.emplace_back(bound);
-      }
-      if (dims != tensor_bounds) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "configuration for model '" + Name() + "' specifies tensor '" +
-                 io_name + "' with full shape " + ShapeToString(dims) +
-                 ", the tensor in the model has shape " +
-                 ShapeToString(tensor_bounds))
-                 .c_str()));
-      }
-    }
-  }
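The core of the shape validation above is: reject any dynamic dimension, prepend the batch dimension when batching is enabled (config shapes omit it, the model's bounds do not), then require an exact match. A self-contained sketch, assuming Triton's conventional wildcard value of -1 for WILDCARD_DIM:

#include <cstdint>
#include <vector>

constexpr int64_t kWildcardDim = -1;  // assumed value of WILDCARD_DIM

enum class ShapeCheck { kOk, kDynamicDim, kMismatch };

ShapeCheck CheckConfigShape(
    std::vector<int64_t> config_dims,          // from "dims" or "reshape"
    const std::vector<int64_t>& model_bounds,  // from the loaded model
    int max_batch_size)
{
  for (int64_t dim : config_dims) {
    if (dim == kWildcardDim) return ShapeCheck::kDynamicDim;
  }
  // Config shapes do not include the batch dimension; add it so the
  // comparison is against the model's full shape.
  if (max_batch_size != 0) {
    config_dims.insert(config_dims.begin(), max_batch_size);
  }
  return (config_dims == model_bounds) ? ShapeCheck::kOk
                                       : ShapeCheck::kMismatch;
}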
-
-  // Outputs
-  {
-    // Build a map from name to tensors of the model for easy lookup
-    std::map<std::string, Tensor*> tensors;
-    for (const auto& io : outputs_) {
-      tensors.emplace(io.first, io.second);
-    }
-
-    triton::common::TritonJson::Value ios;
-    RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &ios));
-
-    // Model config may expose a subset of the outputs
-    if (ios.ArraySize() > tensors.size()) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string(
-               "configuration for model '" + Name() + "' specifies " +
-               std::to_string(ios.ArraySize()) + " outputs, the model has " +
-               std::to_string(tensors.size()))
-               .c_str()));
-    }
-
-    for (size_t i = 0; i < ios.ArraySize(); i++) {
-      triton::common::TritonJson::Value io;
-      RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
-      std::string io_name;
-      RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
-      // Check datatypes
-      std::string io_dtype;
-      RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype));
-      RETURN_ERROR_IF_TRUE(
-          (io_dtype == "TYPE_STRING"), TRITONSERVER_ERROR_INVALID_ARG,
-          std::string("unsupported datatype '") + io_dtype + "' for tensor '" +
-              io_name + "' for model '" + Name() + "'");
-      // If a reshape is provided for the output then use that when
-      // validating that the model matches what is expected.
-      std::vector<int64_t> dims;
-      triton::common::TritonJson::Value reshape;
-      if (io.Find("reshape", &reshape)) {
-        RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims));
-      } else {
-        RETURN_IF_ERROR(ParseShape(io, "dims", &dims));
-      }
-      for (const auto dim : dims) {
-        RETURN_ERROR_IF_TRUE(
-            (dim == WILDCARD_DIM), TRITONSERVER_ERROR_INVALID_ARG,
-            std::string(
-                "dynamic tensor is not supported for model '" + Name() + "'"));
-      }
-
-      // Check the properties against the corresponding tensor
-      auto it = tensors.find(io_name);
-      if (it == tensors.end()) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "configuration for model '" + Name() + "' specifies tensor '" +
-                 io_name + "' which is not found in the model")
-                 .c_str()));
-      }
-      const auto& tensor = it->second;
-      if (ToDataType(ModelConfigDataTypeToTritonServerDataType(io_dtype)) !=
-          tensor->type) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "configuration for model '" + Name() + "' specifies tensor '" +
-                 io_name + "' with type '" + io_dtype +
-                 "', the tensor in the model has type '" +
-                 DataTypeString(tensor->type) + "'")
-                 .c_str()));
-      } else if (tensor->type == DT_NONE) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "tensor '" + io_name + "' in the model '" + Name() +
-                 "' has unknown type")
-                 .c_str()));
-      }
-      if (max_batch_size_ != 0) {
-        dims.insert(dims.begin(), max_batch_size_);
-      }
-      // Put the tensor's bounds in int64_t to utilize backend common utilities
-      std::vector<int64_t> tensor_bounds;
-      for (const auto bound : tensor->bounds) {
-        tensor_bounds.emplace_back(bound);
-      }
-      if (dims != tensor_bounds) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string(
-                 "configuration for model '" + Name() + "' specifies tensor '" +
-                 io_name + "' with full shape " + ShapeToString(dims) +
-                 ", the tensor in the model has shape " +
-                 ShapeToString(tensor_bounds))
-                 .c_str()));
-      }
-    }
-  }
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-LegionModelState::SetOutputInfos()
-{
-  for (const auto& output : outputs_) {
-    std::vector<int64_t> tensor_bounds;
-    for (const auto bound : output.second->bounds) {
-      tensor_bounds.emplace_back(bound);
-    }
-    auto triton_dtype = ToTritonDataType(output.second->type);
-    output_infos_.emplace_back(output.first, triton_dtype, tensor_bounds);
-  }
-  return nullptr;  // success
-}
-
-void
-LegionModelState::LoadLayers(void) const
-{
-  std::vector<Realm::Event> loaded_events;
-  for (unsigned idx1 = 0; idx1 < layers_.size(); idx1++) {
-    Operator* op = layers_[idx1];
-    const LayerStrategy* config = strategy_->layers[idx1];
-    for (unsigned idx2 = 0; idx2 < config->nProcs; idx2++) {
-      Realm::Processor proc = config->local_processors[idx2];
-      loaded_events.push_back(runtime_->LoadLayer(proc, op));
-    }
-  }
-  const Realm::Event wait_on = Realm::Event::merge_events(loaded_events);
-  if (wait_on.exists() && !wait_on.has_triggered())
-    wait_on.external_wait();
-}
-
-void
-LegionModelState::FuseLayers(void)
-{
-  // FIXME: add support for layer fusion
-}
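LoadLayers (and FreeLayers below) fan work out per (layer, processor) pair, collect one Realm::Event per launch, merge them, and block only if the merged event has not already triggered. A rough std::future analog of that fan-out/merge/wait pattern, under the assumption that each launch is independent:

#include <future>
#include <vector>

template <typename Item, typename LaunchFn>
void LaunchAllAndWait(const std::vector<Item>& items, LaunchFn launch)
{
  std::vector<std::future<void>> pending;  // one "event" per launch
  pending.reserve(items.size());
  for (const Item& item : items) {
    pending.push_back(std::async(std::launch::async, launch, item));
  }
  for (auto& f : pending) f.wait();  // "merge" by waiting on all of them
}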
-void
-LegionModelState::FreeLayers(void) const
-{
-  std::vector<Realm::Event> freed_events;
-  for (unsigned idx1 = 0; idx1 < layers_.size(); idx1++) {
-    Operator* op = layers_[idx1];
-    const LayerStrategy* config = strategy_->layers[idx1];
-    for (unsigned idx2 = 0; idx2 < config->nProcs; idx2++) {
-      Realm::Processor proc = config->local_processors[idx2];
-      freed_events.push_back(runtime_->FreeLayer(proc, op));
-    }
-  }
-  const Realm::Event wait_on = Realm::Event::merge_events(freed_events);
-  if (wait_on.exists() && !wait_on.has_triggered())
-    wait_on.external_wait();
-  // Delete layers back to front
-  for (std::vector<Operator*>::const_reverse_iterator it = layers_.rbegin();
-       it != layers_.rend(); it++)
-    delete (*it);
-}
-
-}}}  // namespace triton::backend::legion
diff --git a/triton/src/model.h b/triton/src/model.h
deleted file mode 100644
index 06f7bc0263..0000000000
--- a/triton/src/model.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LEGION_TRITON_MODEL_H__
-#define __LEGION_TRITON_MODEL_H__
-
-#include "legion.h"
-#include "triton/backend/backend_common.h"
-#include "triton/backend/backend_model.h"
-#include "types.h"
-
-namespace triton { namespace backend { namespace legion {
-
-//
-// LegionModelState
-//
-// Captures the metadata needed for representing a model
-//
-class LegionModelState : public BackendModel {
- public:
-  static TRITONSERVER_Error* Create(
-      TRITONBACKEND_Model* triton_model, const std::string& name,
-      uint64_t version, LegionTritonRuntime* runtime, LegionModelState** state);
-  virtual ~LegionModelState();
-
-  unsigned ReserveInstance(void);
-  void RecordInstance(LegionModelInstance* instance);
-
-  LegionModelInstance* FindInstance(
-      unsigned instance_index, bool external, bool need_lock = true);
-  const PartitionStrategy* GetStrategy(void) const;
-
-  // These methods must all be called while the instance is bound
-  // to its implicit top-level task context
-  void initialize(
-      LegionModelInstance* instance, const unsigned instance_index,
-      Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper);
-  void forward(
-      LegionModelInstance* instance, const unsigned instance_index,
-      Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper,
-      const std::vector<InputTensor>& inputs,
-      const std::vector<OutputTensor>& outputs,
-      std::vector<uint64_t>& compute_input_end_ns,
-      std::vector<uint64_t>& compute_output_start_ns);
-  void finalize(
-      LegionModelInstance* instance, const unsigned instance_index,
-      Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper);
-  const std::vector<
-      std::tuple<std::string, TRITONSERVER_DataType, std::vector<int64_t>>>&
-  OutputInfos()
-  {
-    return output_infos_;
-  }
-
- private:
-  LegionModelState(
-      TRITONBACKEND_Model* triton_model, LegionTritonRuntime* runtime,
-      const std::string& n, uint64_t v)
-      : BackendModel(triton_model), runtime_(runtime), name(n), version(v),
-        strategy_(nullptr)
-  {
-  }
-
-  TRITONSERVER_Error* LoadModel();
-  TRITONSERVER_Error* AutoCompleteConfig();
-  TRITONSERVER_Error* ValidateModelConfig();
-  TRITONSERVER_Error* SetOutputInfos();
-
-  void LoadLayers(void) const;
-  void FuseLayers(void);
-  void FreeLayers(void) const;
-
- public:
-  LegionTritonRuntime* const runtime_;
-  const std::string name;
-  const uint64_t version;
-
- private:
-  Realm::FastReservation lock_;
-  std::vector<std::pair<std::string, Tensor*>> inputs_;  // We own these tensors
-  std::vector<std::pair<std::string, Tensor*>>
-      outputs_;  // We do NOT own these tensors
-  std::vector<Operator*> layers_;
-  PartitionStrategy* strategy_;
-  std::vector<LegionModelInstance*> instances_;
-  // Output information parsed from 'outputs_' for easier access,
-  // used to interact with Triton APIs.
-  // FIXME: calculate stride once for all
-  std::vector<
-      std::tuple<std::string, TRITONSERVER_DataType, std::vector<int64_t>>>
-      output_infos_;
-};
-
-}}}  // namespace triton::backend::legion
-
-#endif  // __LEGION_TRITON_MODEL_H__
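For reference, a caller-side sketch of consuming OutputInfos(): each entry is a (name, datatype, shape) triple that unpacks neatly with structured bindings. Only the Triton core header is assumed here, for TRITONSERVER_DataType and its string helper; the function name is illustrative.

#include <cstdint>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

#include "triton/core/tritonserver.h"

using OutputInfo =
    std::tuple<std::string, TRITONSERVER_DataType, std::vector<int64_t>>;

// Print one line per model output: name, datatype, and shape.
void PrintOutputInfos(const std::vector<OutputInfo>& infos)
{
  for (const auto& [name, dtype, shape] : infos) {
    std::cout << name << " (" << TRITONSERVER_DataTypeString(dtype) << "): [";
    for (size_t i = 0; i < shape.size(); ++i) {
      std::cout << (i ? ", " : "") << shape[i];
    }
    std::cout << "]\n";
  }
}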
diff --git a/triton/src/onnx/onnx-data.proto b/triton/src/onnx/onnx-data.proto
deleted file mode 100644
index 2829c2cee8..0000000000
--- a/triton/src/onnx/onnx-data.proto
+++ /dev/null
@@ -1,105 +0,0 @@
-//
-// WARNING: This file is automatically generated! Please edit onnx.in.proto.
-//
-
-
-// SPDX-License-Identifier: Apache-2.0
-
-
-syntax = "proto2";
-
-package onnx;
-import "onnx/onnx-ml.proto";
-
-// This file contains the proto definitions for MapProto and
-// SequenceProto. These protos are used to represent the data structures
-// of maps and sequence for use in test data or ModelProto.
-
-// Sequences
-//
-// Defines a dense, ordered, collection of elements that are of homogeneous types.
-// Sequences can be made out of tensors, maps, or sequences.
-//
-// If a sequence is made out of tensors, the tensors must have the same element
-// type (i.e. int32). In some cases, the tensors in a sequence can have different
-// shapes. Whether the tensors can have different shapes or not depends on the
-// type/shape associated with the corresponding "ValueInfo". For example,
-// "Sequence<Tensor<int32, [M,N]>>" means that all tensors have same shape. However,
-// "Sequence<Tensor<int32, [omitted,omitted]>>" means they can have different
-// shapes (all of rank 2), where "omitted" means the corresponding dimension has
-// no symbolic/constant value. Finally, "Sequence<Tensor<int32>>" means
-// that the different tensors can have different ranks, when the "shape" itself
-// is omitted from the tensor-type. For a more complete description, refer to
-// https://github.com/onnx/onnx/blob/master/docs/IR.md#static-tensor-shapes.
-//
-message SequenceProto {
-
-  optional string name = 1;
-
-  enum DataType {
-    UNDEFINED = 0;
-    TENSOR = 1;
-    SPARSE_TENSOR = 2;
-    SEQUENCE = 3;
-    MAP = 4;
-  }
-
-  // The data type of the element.
-  // This field MUST have a valid SequenceProto.DataType value
-  optional int32 elem_type = 2;
-
-  // For TensorProto values.
-  // When this field is present, the elem_type field MUST be TENSOR.
-  repeated TensorProto tensor_values = 3;
-
-  // For SparseTensorProto values.
-  // When this field is present, the elem_type field MUST be SPARSE_TENSOR.
-  repeated SparseTensorProto sparse_tensor_values = 4;
-
-  // For SequenceProto values, allowing sequences to be of themselves.
-  // When this field is present, the elem_type field MUST be SEQUENCE.
-  repeated SequenceProto sequence_values = 5;
-
-  // For MapProto values.
-  // When this field is present, the elem_type field MUST be MAP.
-  repeated MapProto map_values = 6;
-
-}
-
-
-// Maps
-//
-// Specifies an associative table, defined by keys and values.
-// MapProto is formed with a repeated field of keys (of type INT8, INT16, INT32,
-// INT64, UINT8, UINT16, UINT32, UINT64, or STRING) and values (of type TENSOR,
-// SPARSE_TENSOR, SEQUENCE, or MAP). Key types and value types have to remain
-// the same throughout the instantiation of the MapProto.
-//
-message MapProto {
-
-  optional string name = 1;
-
-  // All MapProto data types must have the same length of keys and values.
-
-  // The data type of the key.
- // This field MUST have a valid TensorProto.DataType value of - // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING - optional int32 key_type = 2; - - // Every element of keys has to be one of the following data types - // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING. - // The integer cases are represented by the repeated int64 field keys below. - repeated int64 keys = 3; - - // If keys are strings, they are represented by the repeated bytes field - // string_keys below. - repeated bytes string_keys = 4; - - // MapProto values are represented in a SequenceProto of the same length as the - // repeated keys field and have to be one of the following data types - // TENSOR, SPARSE_TENSOR, MAP, SEQUENCE. - optional SequenceProto values = 5; -} - -// For using protobuf-lite -option optimize_for = LITE_RUNTIME; diff --git a/triton/src/onnx/onnx-ml.proto b/triton/src/onnx/onnx-ml.proto deleted file mode 100644 index 8f73a8bb9f..0000000000 --- a/triton/src/onnx/onnx-ml.proto +++ /dev/null @@ -1,742 +0,0 @@ -// -// WARNING: This file is automatically generated! Please edit onnx.in.proto. -// - - -// SPDX-License-Identifier: Apache-2.0 - - -syntax = "proto2"; - -package onnx; - -// Overview -// -// ONNX is an open specification that is comprised of the following components: -// -// 1) A definition of an extensible computation graph model. -// 2) Definitions of standard data types. -// 3) Definitions of built-in operators. -// -// This document describes the syntax of models and their computation graphs, -// as well as the standard data types. Together, they are referred to as the ONNX -// Intermediate Representation, or 'IR' for short. -// -// The normative semantic specification of the ONNX IR is found in docs/IR.md. -// Definitions of the built-in neural network operators may be found in docs/Operators.md. -// Definitions of the built-in classical machine learning operators may be found in -// docs/Operators-ml.md. - -// Notes -// -// Release -// -// We are still in the very early stage of defining ONNX. The current -// version of ONNX is a starting point. While we are actively working -// towards a complete spec, we would like to get the community involved -// by sharing our working version of ONNX. -// -// Protobuf compatibility -// -// To simplify framework compatibility, ONNX is defined using the subset of protobuf -// that is compatible with both protobuf v2 and v3. This means that we do not use any -// protobuf features that are only available in one of the two versions. -// -// Here are the most notable contortions we have to carry out to work around -// these limitations: -// -// - No 'map' (added protobuf 3.0). We instead represent mappings as lists -// of key-value pairs, where order does not matter and duplicates -// are not allowed. - - -// Versioning -// -// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md -// -// To be compatible with both proto2 and proto3, we will use a version number -// that is not defined by the default value but an explicit enum number. -enum Version { - // proto3 requires the first enum value to be zero. - // We add this just to appease the compiler. - _START_VERSION = 0; - // The version field is always serialized and we will use it to store the - // version that the graph is generated from. This helps us set up version - // control. - // For the IR, we are using simple numbers starting with 0x00000001, - // which was the version we published on Oct 10, 2017. 
- IR_VERSION_2017_10_10 = 0x0000000000000001; - - // IR_VERSION 2 published on Oct 30, 2017 - // - Added type discriminator to AttributeProto to support proto3 users - IR_VERSION_2017_10_30 = 0x0000000000000002; - - // IR VERSION 3 published on Nov 3, 2017 - // - For operator versioning: - // - Added new message OperatorSetIdProto - // - Added opset_import in ModelProto - // - For vendor extensions, added domain in NodeProto - IR_VERSION_2017_11_3 = 0x0000000000000003; - - // IR VERSION 4 published on Jan 22, 2019 - // - Relax constraint that initializers should be a subset of graph inputs - // - Add type BFLOAT16 - IR_VERSION_2019_1_22 = 0x0000000000000004; - - // IR VERSION 5 published on March 18, 2019 - // - Add message TensorAnnotation. - // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters. - IR_VERSION_2019_3_18 = 0x0000000000000005; - - // IR VERSION 6 published on Sep 19, 2019 - // - Add support for sparse tensor constants stored in model. - // - Add message SparseTensorProto - // - Add sparse initializers - IR_VERSION_2019_9_19 = 0x0000000000000006; - - // IR VERSION 7 published on - // - Add support to allow function body graph to rely on multiple external opreator sets. - // - Add a list to promote inference graph's initializers to global and - // mutable variables. Global variables are visible in all graphs of the - // stored models. - // - Add message TrainingInfoProto to store initialization - // method and training algorithm. The execution of TrainingInfoProto - // can modify the values of mutable variables. - // - Implicitly add inference graph into each TrainingInfoProto's algorithm. - IR_VERSION = 0x0000000000000007; -} - -// Attributes -// -// A named attribute containing either singular float, integer, string, graph, -// and tensor values, or repeated float, integer, string, graph, and tensor values. -// An AttributeProto MUST contain the name field, and *only one* of the -// following content fields, effectively enforcing a C/C++ union equivalent. -message AttributeProto { - - // Note: this enum is structurally identical to the OpSchema::AttrType - // enum defined in schema.h. If you rev one, you likely need to rev the other. - enum AttributeType { - UNDEFINED = 0; - FLOAT = 1; - INT = 2; - STRING = 3; - TENSOR = 4; - GRAPH = 5; - SPARSE_TENSOR = 11; - - FLOATS = 6; - INTS = 7; - STRINGS = 8; - TENSORS = 9; - GRAPHS = 10; - SPARSE_TENSORS = 12; - } - - // The name field MUST be present for this version of the IR. - optional string name = 1; // namespace Attribute - - // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. - // In this case, this AttributeProto does not contain data, and it's a reference of attribute - // in parent scope. - // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. - optional string ref_attr_name = 21; - - // A human-readable documentation for this attribute. Markdown is allowed. - optional string doc_string = 13; - - // The type field MUST be present for this version of the IR. - // For 0.0.1 versions of the IR, this field was not defined, and - // implementations needed to use has_field heuristics to determine - // which value field was in use. For IR_VERSION 0.0.2 or later, this - // field MUST be set and match the f|i|s|t|... field in use. This - // change was made to accommodate proto3 implementations. 
- optional AttributeType type = 20; // discriminator that indicates which field below is in use - - // Exactly ONE of the following fields must be present for this version of the IR - optional float f = 2; // float - optional int64 i = 3; // int - optional bytes s = 4; // UTF-8 string - optional TensorProto t = 5; // tensor value - optional GraphProto g = 6; // graph - optional SparseTensorProto sparse_tensor = 22; // sparse tensor value - // Do not use field below, it's deprecated. - // optional ValueProto v = 12; // value - subsumes everything but graph - - repeated float floats = 7; // list of floats - repeated int64 ints = 8; // list of ints - repeated bytes strings = 9; // list of UTF-8 strings - repeated TensorProto tensors = 10; // list of tensors - repeated GraphProto graphs = 11; // list of graph - repeated SparseTensorProto sparse_tensors = 23; // list of sparse tensors -} - -// Defines information on value, including the name, the type, and -// the shape of the value. -message ValueInfoProto { - // This field MUST be present in this version of the IR. - optional string name = 1; // namespace Value - // This field MUST be present in this version of the IR for - // inputs and outputs of the top-level graph. - optional TypeProto type = 2; - // A human-readable documentation for this value. Markdown is allowed. - optional string doc_string = 3; -} - -// Nodes -// -// Computation graphs are made up of a DAG of nodes, which represent what is -// commonly called a "layer" or "pipeline stage" in machine learning frameworks. -// -// For example, it can be a node of type "Conv" that takes in an image, a filter -// tensor and a bias tensor, and produces the convolved output. -message NodeProto { - repeated string input = 1; // namespace Value - repeated string output = 2; // namespace Value - - // An optional identifier for this node in a graph. - // This field MAY be absent in ths version of the IR. - optional string name = 3; // namespace Node - - // The symbolic identifier of the Operator to execute. - optional string op_type = 4; // namespace Operator - // The domain of the OperatorSet that specifies the operator named by op_type. - optional string domain = 7; // namespace Domain - - // Additional named attributes. - repeated AttributeProto attribute = 5; - - // A human-readable documentation for this node. Markdown is allowed. - optional string doc_string = 6; -} - -// Training information -// TrainingInfoProto stores information for training a model. -// In particular, this defines two functionalities: an initialization-step -// and a training-algorithm-step. Initialization resets the model -// back to its original state as if no training has been performed. -// Training algorithm improves the model based on input data. -// -// The semantics of the initialization-step is that the initializers -// in ModelProto.graph and in TrainingInfoProto.algorithm are first -// initialized as specified by the initializers in the graph, and then -// updated by the "initialization_binding" in every instance in -// ModelProto.training_info. -// -// The field "algorithm" defines a computation graph which represents a -// training algorithm's step. After the execution of a -// TrainingInfoProto.algorithm, the initializers specified by "update_binding" -// may be immediately updated. If the targeted training algorithm contains -// consecutive update steps (such as block coordinate descent methods), -// the user needs to create a TrainingInfoProto for each step. 
-message TrainingInfoProto { - // This field describes a graph to compute the initial tensors - // upon starting the training process. Initialization graph has no input - // and can have multiple outputs. Usually, trainable tensors in neural - // networks are randomly initialized. To achieve that, for each tensor, - // the user can put a random number operator such as RandomNormal or - // RandomUniform in TrainingInfoProto.initialization.node and assign its - // random output to the specific tensor using "initialization_binding". - // This graph can also set the initializers in "algorithm" in the same - // TrainingInfoProto; a use case is resetting the number of training - // iteration to zero. - // - // By default, this field is an empty graph and its evaluation does not - // produce any output. Thus, no initializer would be changed by default. - optional GraphProto initialization = 1; - - // This field represents a training algorithm step. Given required inputs, - // it computes outputs to update initializers in its own or inference graph's - // initializer lists. In general, this field contains loss node, gradient node, - // optimizer node, increment of iteration count. - // - // An execution of the training algorithm step is performed by executing the - // graph obtained by combining the inference graph (namely "ModelProto.graph") - // and the "algorithm" graph. That is, the actual the actual - // input/initializer/output/node/value_info/sparse_initializer list of - // the training graph is the concatenation of - // "ModelProto.graph.input/initializer/output/node/value_info/sparse_initializer" - // and "algorithm.input/initializer/output/node/value_info/sparse_initializer" - // in that order. This combined graph must satisfy the normal ONNX conditions. - // Now, let's provide a visualization of graph combination for clarity. - // Let the inference graph (i.e., "ModelProto.graph") be - // tensor_a, tensor_b -> MatMul -> tensor_c -> Sigmoid -> tensor_d - // and the "algorithm" graph be - // tensor_d -> Add -> tensor_e - // The combination process results - // tensor_a, tensor_b -> MatMul -> tensor_c -> Sigmoid -> tensor_d -> Add -> tensor_e - // - // Notice that an input of a node in the "algorithm" graph may reference the - // output of a node in the inference graph (but not the other way round). Also, inference - // node cannot reference inputs of "algorithm". With these restrictions, inference graph - // can always be run independently without training information. - // - // By default, this field is an empty graph and its evaluation does not - // produce any output. Evaluating the default training step never - // update any initializers. - optional GraphProto algorithm = 2; - - // This field specifies the bindings from the outputs of "initialization" to - // some initializers in "ModelProto.graph.initializer" and - // the "algorithm.initializer" in the same TrainingInfoProto. - // See "update_binding" below for details. - // - // By default, this field is empty and no initializer would be changed - // by the execution of "initialization". - repeated StringStringEntryProto initialization_binding = 3; - - // Gradient-based training is usually an iterative procedure. In one gradient - // descent iteration, we apply - // - // x = x - r * g - // - // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is - // gradient of "x" with respect to a chosen loss. 
To avoid adding assignments - // into the training graph, we split the update equation into - // - // y = x - r * g - // x = y - // - // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm. To - // tell that "y" should be assigned to "x", the field "update_binding" may - // contain a key-value pair of strings, "x" (key of StringStringEntryProto) - // and "y" (value of StringStringEntryProto). - // For a neural network with multiple trainable (mutable) tensors, there can - // be multiple key-value pairs in "update_binding". - // - // The initializers appears as keys in "update_binding" are considered - // mutable variables. This implies some behaviors - // as described below. - // - // 1. We have only unique keys in all "update_binding"s so that two - // variables may not have the same name. This ensures that one - // variable is assigned up to once. - // 2. The keys must appear in names of "ModelProto.graph.initializer" or - // "TrainingInfoProto.algorithm.initializer". - // 3. The values must be output names of "algorithm" or "ModelProto.graph.output". - // 4. Mutable variables are initialized to the value specified by the - // corresponding initializer, and then potentially updated by - // "initializer_binding"s and "update_binding"s in "TrainingInfoProto"s. - // - // This field usually contains names of trainable tensors - // (in ModelProto.graph), optimizer states such as momentums in advanced - // stochastic gradient methods (in TrainingInfoProto.graph), - // and number of training iterations (in TrainingInfoProto.graph). - // - // By default, this field is empty and no initializer would be changed - // by the execution of "algorithm". - repeated StringStringEntryProto update_binding = 4; -} - -// Models -// -// ModelProto is a top-level file/container format for bundling a ML model and -// associating its computation graph with metadata. -// -// The semantics of the model are described by the associated GraphProto's. -message ModelProto { - // The version of the IR this model targets. See Version enum above. - // This field MUST be present. - optional int64 ir_version = 1; - - // The OperatorSets this model relies on. - // All ModelProtos MUST have at least one entry that - // specifies which version of the ONNX OperatorSet is - // being imported. - // - // All nodes in the ModelProto's graph will bind against the operator - // with the same-domain/same-op_type operator with the HIGHEST version - // in the referenced operator sets. - repeated OperatorSetIdProto opset_import = 8; - - // The name of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - optional string producer_name = 2; - - // The version of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - optional string producer_version = 3; - - // Domain name of the model. - // We use reverse domain names as name space indicators. For example: - // `com.facebook.fair` or `com.microsoft.cognitiveservices` - // - // Together with `model_version` and GraphProto.name, this forms the unique identity of - // the graph. - optional string domain = 4; - - // The version of the graph encoded. See Version enum below. - optional int64 model_version = 5; - - // A human-readable documentation for this model. Markdown is allowed. 
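A worked instance of the split described above: with learning rate r = 0.1, gradient g = 4.0, and current weight x = 2.0, the "algorithm" graph computes y = x - r * g = 2.0 - 0.1 * 4.0 = 1.6, and an update_binding pair with key "x" and value "y" directs the runtime to assign 1.6 back to the initializer "x" once the training step completes.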
- optional string doc_string = 6; - - // The parameterized graph that is evaluated to execute the model. - optional GraphProto graph = 7; - - // Named metadata values; keys should be distinct. - repeated StringStringEntryProto metadata_props = 14; - - // Training-specific information. Sequentially executing all stored - // `TrainingInfoProto.algorithm`s and assigning their outputs following - // the corresponding `TrainingInfoProto.update_binding`s is one training - // iteration. Similarly, to initialize the model - // (as if training hasn't happened), the user should sequentially execute - // all stored `TrainingInfoProto.initialization`s and assigns their outputs - // using `TrainingInfoProto.initialization_binding`s. - // - // If this field is empty, the training behavior of the model is undefined. - repeated TrainingInfoProto training_info = 20; -}; - -// StringStringEntryProto follows the pattern for cross-proto-version maps. -// See https://developers.google.com/protocol-buffers/docs/proto3#maps -message StringStringEntryProto { - optional string key = 1; - optional string value= 2; -}; - -message TensorAnnotation { - optional string tensor_name = 1; - // pairs to annotate tensor specified by above. - // The keys used in the mapping below must be pre-defined in ONNX spec. - // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as - // quantization parameter keys. - repeated StringStringEntryProto quant_parameter_tensor_names = 2; -} - - - -// Graphs -// -// A graph defines the computational logic of a model and is comprised of a parameterized -// list of nodes that form a directed acyclic graph based on their inputs and outputs. -// This is the equivalent of the "network" or "graph" in many deep learning -// frameworks. -message GraphProto { - // The nodes in the graph, sorted topologically. - repeated NodeProto node = 1; - - // The name of the graph. - optional string name = 2; // namespace Graph - - // A list of named tensor values, used to specify constant inputs of the graph. - // Each initializer (both TensorProto as well SparseTensorProto) MUST have a name. - // The name MUST be unique across both initializer and sparse_initializer, - // but the name MAY also appear in the input list. - repeated TensorProto initializer = 5; - - // Initializers (see above) stored in sparse format. - repeated SparseTensorProto sparse_initializer = 15; - - // A human-readable documentation for this graph. Markdown is allowed. - optional string doc_string = 10; - - // The inputs and outputs of the graph. - repeated ValueInfoProto input = 11; - repeated ValueInfoProto output = 12; - - // Information for the values in the graph. The ValueInfoProto.name's - // must be distinct. It is optional for a value to appear in value_info list. - repeated ValueInfoProto value_info = 13; - - // This field carries information to indicate the mapping among a tensor and its - // quantization parameter tensors. For example: - // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, - // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. - repeated TensorAnnotation quantization_annotation = 14; - - // DO NOT USE the following fields, they were deprecated from earlier versions. 
- // repeated string input = 3; - // repeated string output = 4; - // optional int64 ir_version = 6; - // optional int64 producer_version = 7; - // optional string producer_tag = 8; - // optional string domain = 9; -} - -// Tensors -// -// A serialized tensor value. -message TensorProto { - enum DataType { - UNDEFINED = 0; - // Basic types. - FLOAT = 1; // float - UINT8 = 2; // uint8_t - INT8 = 3; // int8_t - UINT16 = 4; // uint16_t - INT16 = 5; // int16_t - INT32 = 6; // int32_t - INT64 = 7; // int64_t - STRING = 8; // string - BOOL = 9; // bool - - // IEEE754 half-precision floating-point format (16 bits wide). - // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. - FLOAT16 = 10; - - DOUBLE = 11; - UINT32 = 12; - UINT64 = 13; - COMPLEX64 = 14; // complex with float32 real and imaginary components - COMPLEX128 = 15; // complex with float64 real and imaginary components - - // Non-IEEE floating-point format based on IEEE754 single-precision - // floating-point number truncated to 16 bits. - // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. - BFLOAT16 = 16; - - // Future extensions go here. - } - - // The shape of the tensor. - repeated int64 dims = 1; - - // The data type of the tensor. - // This field MUST have a valid TensorProto.DataType value - optional int32 data_type = 2; - - // For very large tensors, we may want to store them in chunks, in which - // case the following fields will specify the segment that is stored in - // the current TensorProto. - message Segment { - optional int64 begin = 1; - optional int64 end = 2; - } - optional Segment segment = 3; - - // Tensor content must be organized in row-major order. - // - // Depending on the data_type field, exactly one of the fields below with - // name ending in _data is used to store the elements of the tensor. - - // For float and complex64 values - // Complex64 tensors are encoded as a single array of floats, - // with the real components appearing in odd numbered positions, - // and the corresponding imaginary component appearing in the - // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - // is encoded as [1.0, 2.0 ,3.0 ,4.0] - // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. - repeated float float_data = 4 [packed = true]; - - // For int32, uint8, int8, uint16, int16, bool, and float16 values - // float16 values must be bit-wise converted to an uint16_t prior - // to writing to the buffer. - // When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 - repeated int32 int32_data = 5 [packed = true]; - - // For strings. - // Each element of string_data is a UTF-8 encoded Unicode - // string. No trailing null, no leading BOM. The protobuf "string" - // scalar type is not used to match ML community conventions. - // When this field is present, the data_type field MUST be STRING - repeated bytes string_data = 6; - - // For int64. - // When this field is present, the data_type field MUST be INT64 - repeated int64 int64_data = 7 [packed = true]; - - // Optionally, a name for the tensor. - optional string name = 8; // namespace Value - - // A human-readable documentation for this tensor. Markdown is allowed. - optional string doc_string = 12; - - // Serializations can either use one of the fields above, or use this - // raw bytes field. The only exception is the string case, where one is - // required to store the content in the repeated bytes string_data field. 
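As a usage illustration of the storage rules above (not part of the file being removed), building a small dense tensor with the protoc-generated C++ classes might look like the following; the generated header path is an assumption of this sketch.

#include "onnx/onnx-ml.pb.h"  // assumed protoc output for onnx-ml.proto

// Build a 2x3 float tensor stored in the type-specific float_data field.
onnx::TensorProto MakeWeight()
{
  onnx::TensorProto t;
  t.set_name("weight");
  t.set_data_type(onnx::TensorProto::FLOAT);  // MUST match float_data below
  t.add_dims(2);
  t.add_dims(3);
  for (float v : {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}) {
    t.add_float_data(v);  // elements in row-major order, as required
  }
  return t;
}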
- // - // When this raw_data field is used to store tensor value, elements MUST - // be stored in as fixed-width, little-endian order. - // Floating-point data types MUST be stored in IEEE 754 format. - // Complex64 elements must be written as two consecutive FLOAT values, real component first. - // Complex128 elements must be written as two consecutive DOUBLE values, real component first. - // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). - // - // Note: the advantage of specific field rather than the raw_data field is - // that in some cases (e.g. int data), protobuf does a better packing via - // variable length storage, and may lead to smaller binary footprint. - // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED - optional bytes raw_data = 9; - - // Data can be stored inside the protobuf file using type-specific fields or raw_data. - // Alternatively, raw bytes data can be stored in an external file, using the external_data field. - // external_data stores key-value pairs describing data location. Recognized keys are: - // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX - // protobuf model was stored - // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. - // Offset values SHOULD be multiples 4096 (page size) to enable mmap support. - // - "length" (optional) - number of bytes containing data. Integer stored as string. - // - "checksum" (optional) - SHA1 digest of file specified in under 'location' key. - repeated StringStringEntryProto external_data = 13; - - // Location of the data for this tensor. MUST be one of: - // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. - // - EXTERNAL - data stored in an external location as described by external_data field. - enum DataLocation { - DEFAULT = 0; - EXTERNAL = 1; - } - - // If value not set, data is stored in raw_data (if set) otherwise in type-specified field. - optional DataLocation data_location = 14; - - // For double - // Complex128 tensors are encoded as a single array of doubles, - // with the real components appearing in odd numbered positions, - // and the corresponding imaginary component appearing in the - // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - // is encoded as [1.0, 2.0 ,3.0 ,4.0] - // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 - repeated double double_data = 10 [packed = true]; - - // For uint64 and uint32 values - // When this field is present, the data_type field MUST be - // UINT32 or UINT64 - repeated uint64 uint64_data = 11 [packed = true]; -} - -// A serialized sparse-tensor value -message SparseTensorProto { - // The sequence of non-default values are encoded as a tensor of shape [NNZ]. - // The default-value is zero for numeric tensors, and empty-string for string tensors. - // values must have a non-empty name present which serves as a name for SparseTensorProto - // when used in sparse_initializer list. - optional TensorProto values = 1; - - // The indices of the non-default values, which may be stored in one of two formats. - // (a) Indices can be a tensor of shape [NNZ, rank] with the [i,j]-th value - // corresponding to the j-th index of the i-th value (in the values tensor). 
- // (b) Indices can be a tensor of shape [NNZ], in which case the i-th value - // must be the linearized-index of the i-th value (in the values tensor). - // The linearized-index can be converted into an index tuple (k_1,...,k_rank) - // using the shape provided below. - // The indices must appear in ascending order without duplication. - // In the first format, the ordering is lexicographic-ordering: - // e.g., index-value [1,4] must appear before [2,1] - optional TensorProto indices = 2; - - // The shape of the underlying dense-tensor: [dim_1, dim_2, ... dim_rank] - repeated int64 dims = 3; -} - -// Defines a tensor shape. A dimension can be either an integer value -// or a symbolic variable. A symbolic variable represents an unknown -// dimension. -message TensorShapeProto { - message Dimension { - oneof value { - int64 dim_value = 1; - string dim_param = 2; // namespace Shape - }; - // Standard denotation can optionally be used to denote tensor - // dimensions with standard semantic descriptions to ensure - // that operations are applied to the correct axis of a tensor. - // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition - // for pre-defined dimension denotations. - optional string denotation = 3; - }; - repeated Dimension dim = 1; -} - -// Types -// -// The standard ONNX data types. -message TypeProto { - - message Tensor { - // This field MUST NOT have the value of UNDEFINED - // This field MUST have a valid TensorProto.DataType value - // This field MUST be present for this version of the IR. - optional int32 elem_type = 1; - optional TensorShapeProto shape = 2; - } - - // repeated T - message Sequence { - // The type and optional shape of each element of the sequence. - // This field MUST be present for this version of the IR. - optional TypeProto elem_type = 1; - }; - - // map - message Map { - // This field MUST have a valid TensorProto.DataType value - // This field MUST be present for this version of the IR. - // This field MUST refer to an integral type ([U]INT{8|16|32|64}) or STRING - optional int32 key_type = 1; - // This field MUST be present for this version of the IR. - optional TypeProto value_type = 2; - }; - - - message SparseTensor { - // This field MUST NOT have the value of UNDEFINED - // This field MUST have a valid TensorProto.DataType value - // This field MUST be present for this version of the IR. - optional int32 elem_type = 1; - optional TensorShapeProto shape = 2; - } - - message Opaque { - // When missing, the domain is the same as the model's. - optional string domain = 1; - // The name is optional but significant when provided. - optional string name = 2; - // parameters that help defining the type - // DEPRECATED do not use. - // repeated TypeProto parameters = 3; - } - - - oneof value { - // The type of a tensor. - Tensor tensor_type = 1; - - // NOTE: DNN-only implementations of ONNX MAY elect to not support non-tensor values - // as input and output to graphs and nodes. These types are needed to naturally - // support classical ML operators. DNN operators SHOULD restrict their input - // and output types to tensors. - - // The type of a sequence. - Sequence sequence_type = 4; - - // The type of a map. - Map map_type = 5; - - - SparseTensor sparse_tensor_type = 8; - - Opaque opaque_type = 7; - - } - - // An optional denotation can be used to denote the whole - // type with a standard semantic description as to what is - // stored inside. 
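A small helper makes format (b) concrete: the linearized index is a row-major accumulation over the dense shape, so index (1, 4) in a tensor with dims [3, 5] linearizes to 1*5 + 4 = 9.

#include <cstdint>
#include <vector>

// Convert an index tuple into the linearized index used by format (b)
// of SparseTensorProto.indices, assuming row-major order over dims.
int64_t LinearizeIndex(
    const std::vector<int64_t>& index, const std::vector<int64_t>& dims)
{
  int64_t linear = 0;
  for (size_t d = 0; d < dims.size(); ++d) {
    linear = linear * dims[d] + index[d];  // row-major accumulation
  }
  return linear;
}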
Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition - // for pre-defined type denotations. - optional string denotation = 6; -} - -// Operator Sets -// -// OperatorSets are uniquely identified by a (domain, opset_version) pair. -message OperatorSetIdProto { - // The domain of the operator set being identified. - // The empty string ("") or absence of this field implies the operator - // set that is defined as part of the ONNX specification. - // This field MUST be present in this version of the IR when referring to any other operator set. - optional string domain = 1; - - // The version of the operator set being identified. - // This field MUST be present in this version of the IR. - optional int64 version = 2; -} - - -// For using protobuf-lite -option optimize_for = LITE_RUNTIME; diff --git a/triton/src/onnx/onnx-operators-ml.proto b/triton/src/onnx/onnx-operators-ml.proto deleted file mode 100644 index 11ebad0dc2..0000000000 --- a/triton/src/onnx/onnx-operators-ml.proto +++ /dev/null @@ -1,185 +0,0 @@ -// -// WARNING: This file is automatically generated! Please edit onnx.in.proto. -// - - -// Copyright (c) ONNX Project Contributors. -// Licensed under the MIT license. - -syntax = "proto2"; - -package onnx; -import "onnx/onnx-ml.proto"; - -// -// This file contains the proto definitions for OperatorSetProto and -// OperatorProto. OperatorSetProtos are used to describe a versioned -// set of operators that can be used by a ModelProto. -// -// Like ModelProto, OperatorSetProto is defined as a top-level file/wire -// format, however their usage is different. -// -// ModelProto files are used to describe executable graphs that can be -// executed directly by a framework, runtime, or engine. -// -// OperatorSetProto files are used to describe a set of operators that are -// available in a given environment. The file TBD.TBD is the OperatorSetProto -// that describes the ONNX standard operators. -// - -// Operator/function status. -enum OperatorStatus { - EXPERIMENTAL = 0; - STABLE = 1; -} - -message FunctionProto { - // The name of the function, similar usage of op_type in OperatorProto. - optional string name = 1; - - // The first version of a function set which contains this function. - // When there's any breaking change for this function, the function set - // contains the function needs to bump its version, and since_version of - // the updated function will be changed to the updated function set version. - optional int64 since_version = 2; - - // This field indicates whether the syntax, semantics, or presence - // of this function is in an experimental or stable stage. Once an - // function is published as STABLE, its syntax and semantics MUST NOT - // change in subsequent versions of the operator set. - // When a function is published as EXPERIMENTAL, the syntax and semantics - // of the function MAY change across operator set versions. - // Functions "become" stable by deprecating the experimental version and - // introducing a new stable function with the same name. - optional OperatorStatus status = 3; - - // The inputs and outputs of the function. - repeated string input = 4; - repeated string output = 5; - - // The attributes of the function. - repeated string attribute= 6; - - // The nodes in the function. - repeated NodeProto node = 7; - // A human-readable documentation for this function. Markdown is allowed. - optional string doc_string = 8; - - // The OperatorSets this function body (graph) relies on. 
- // A FunctionProto body (graph) may implicitly rely on the OperatorSet that - // this function belongs to. It can also explicitly rely on more OperatorSets - // with this field specified. - // - // All nodes in the function body (graph) will bind against the operator - // with the same-domain/same-op_type operator with the HIGHEST version - // in the referenced operator sets. This means at most one version can be relied - // for one domain. - repeated OperatorSetIdProto opset_import = 9; -} - -// An OperatorProto represents the immutable specification of the signature -// and semantics of an operator. -// -// Operators are declared as part of an OperatorSet, which also defines the -// domain name for the set. -// -// Operators are uniquely identified by a three part identifier -// (domain, op_type, since_version) -// where -// *domain* is the domain of an operator set that -// contains this operator specification. -// -// *op_type* is the name of the operator as referenced by a -// NodeProto.op_type -// -// *since_version* is the version of the operator set that -// this operator was initially declared in. -// -message OperatorProto { - // The name of the operator within a domain. - // This field MUST be present in this version of the IR. - optional string op_type = 1; - - // The version of the operator set that first introduced this - // operator. This value MUST be the same value as the - // opset_version of the operator set that first published this operator. - // Subsequent versions of the operator set MUST NOT alter the signature - // or semantics of the operator once published as STABLE. - // This field MUST be present in this version of the IR. - optional int64 since_version = 2; - - // This field indicates whether the syntax, semantics, or presence - // of this operator is in an experimental or stable stage. Once an - // operator is published as STABLE, it's syntax and semantics MUST NOT - // change in subsequent versions of the operator set. - // When an operator is published as EXPERIMENTAL, the syntax and semantics - // of the operator MAY change across operator set versions. - // Operators "become" stable by deprecating the experimental version and - // introducing a new stable operator with the same op_type. - optional OperatorStatus status = 3; - - // Eventually we will declare the signature of the operator here - - // A human-readable documentation for this operator. Markdown is allowed. - optional string doc_string = 10; -} - -// An OperatorSetProto represents an immutable set of immutable operator -// specifications. -// -// The domain of the set (OperatorSetProto.domain) is a reverse-DNS name -// that disambiguates operator sets defined by independent entities. -// -// The version of the set (opset_version) is a monotonically increasing -// integer that indicates changes to the membership of the operator set. -// -// -// Operator sets are uniquely identified by a two part identifier (domain, opset_version) -// -// Like ModelProto, OperatorSetProto is intended as a top-level file/wire format, -// and thus has the standard format headers in addition to the operator set information. -// -message OperatorSetProto { - // All OperatorSetProtos start with a distingushed byte sequence to disambiguate - // protobuf files containing OperatorSets from other content. 
- // This field MUST be "ONNXOPSET" - // This field MUST be present in this version of the IR - optional string magic = 1; - - // All OperatorSetProtos indicate the version of the IR syntax and semantics - // they adhere to. It is always IR_VERSION. - // This field MUST be present in this version of the IR - optional int64 ir_version = 2; - - // The prerelease component of the SemVer of the IR. - // This field MAY be absent in this version of the IR - optional string ir_version_prerelease = 3; - - // The build metadata component of the SemVer of the IR. - // This field MAY be absent in this version of the IR - optional string ir_build_metadata = 7; - - // Domain name of the operator set, in reverse DNS form (e.g., com.acme.dnnops). - optional string domain = 4; - - // The version of the set of operators. This is a simple int value - // that is monotonically increasing as new versions of operator set - // are published. All operators in this set MUST have version - // numbers no greater than opset_version. - optional int64 opset_version = 5; - - // A human-readable documentation for this set of operators. Markdown is allowed. - optional string doc_string = 6; - - // The operators specified by this operator set. - // The (name, version) MUST be unique across all OperatorProtos in operator - repeated OperatorProto operator = 8; - - // The functions specified by this operator set. - // The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions - repeated FunctionProto functions = 9; -} - - -// For using protobuf-lite -option optimize_for = LITE_RUNTIME; diff --git a/triton/src/onnx_parser.cc b/triton/src/onnx_parser.cc deleted file mode 100644 index fdf913fcb9..0000000000 --- a/triton/src/onnx_parser.cc +++ /dev/null @@ -1,1485 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "onnx_parser.h"
-
-#include "model.h"
-
-// Legion layers
-#include "operators/binary.h"
-#include "operators/conv2d.h"
-#include "operators/pool2d.h"
-#include "operators/softmax.h"
-#include "operators/unary.h"
-
-#include
-#include
-#include
-#include
-#include
-#include "triton/backend/backend_common.h"
-
-namespace triton { namespace backend { namespace legion {
-
-namespace {
-
-#define RETURN_IF_TYPE_MISMATCH(NODE, ATTRIBUTE, EXPECTED_TYPE)            \
-  do {                                                                     \
-    if (ATTRIBUTE.type() != EXPECTED_TYPE) {                               \
-      return TRITONSERVER_ErrorNew(                                        \
-          TRITONSERVER_ERROR_INVALID_ARG,                                  \
-          (std::string("Attribute '") + ATTRIBUTE.name() + "' for '" +     \
-           NODE.op_type() + "' layer named '" + NODE.name() +              \
-           "' must have attribute type " +                                 \
-           onnx::AttributeProto::AttributeType_Name(EXPECTED_TYPE))        \
-              .c_str());                                                   \
-    }                                                                      \
-  } while (false)
-
-
-TRITONSERVER_Error*
-OnnxTypeToDataType(const int32_t element_type, DataType* converted_type)
-{
-  switch (element_type) {
-    case 10 /* FLOAT16 */:
-      *converted_type = DT_HALF;
-      break;
-    case 1 /* FLOAT */:
-      *converted_type = DT_FLOAT;
-      break;
-    case 11 /* DOUBLE */:
-      *converted_type = DT_DOUBLE;
-      break;
-    case 3 /* INT8 */:
-      *converted_type = DT_INT8;
-      break;
-    case 5 /* INT16 */:
-      *converted_type = DT_INT16;
-      break;
-    case 6 /* INT32 */:
-      *converted_type = DT_INT32;
-      break;
-    case 7 /* INT64 */:
-      *converted_type = DT_INT64;
-      break;
-    case 2 /* UINT8 */:
-      *converted_type = DT_UINT8;
-      break;
-    case 4 /* UINT16 */:
-      *converted_type = DT_UINT16;
-      break;
-    case 12 /* UINT32 */:
-      *converted_type = DT_UINT32;
-      break;
-    case 13 /* UINT64 */:
-      *converted_type = DT_UINT64;
-      break;
-    case 9 /* BOOL */:
-      *converted_type = DT_BOOLEAN;
-      break;
-    default:
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_UNSUPPORTED,
-          (std::string("Unsupported ONNX tensor type '") +
-           onnx::TensorProto_DataType_Name(element_type) + "'")
-              .c_str());
-      break;
-  }
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-ReadTextFile(const std::string& path, std::string* contents)
-{
-  std::ifstream in(path, std::ios::in | std::ios::binary);
-  if (!in) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL, std::string(
-                                         "failed to open text file for read " +
-                                         path + ": " + strerror(errno))
-                                         .c_str());
-  }
-
-  in.seekg(0, std::ios::end);
-  contents->resize(in.tellg());
-  in.seekg(0, std::ios::beg);
-  in.read(&(*contents)[0], contents->size());
-  in.close();
-
-  return nullptr;  // success
-}
-
-}  // namespace
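The switch in OnnxTypeToDataType is a straight 1:1 mapping from onnx::TensorProto::DataType enum values to the backend's DataType. A table-driven sketch of the same mapping (the DT_* names follow the cases above but the enum is redeclared here as a stand-in so the snippet stays self-contained):

#include <cstdint>
#include <optional>
#include <unordered_map>

enum DataType { DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32,
                DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64,
                DT_BOOLEAN };

std::optional<DataType> MapOnnxType(int32_t onnx_elem_type)
{
  // Keys are onnx::TensorProto::DataType values (see onnx-ml.proto).
  static const std::unordered_map<int32_t, DataType> kTypeMap = {
      {1, DT_FLOAT},  {2, DT_UINT8},   {3, DT_INT8},    {4, DT_UINT16},
      {5, DT_INT16},  {6, DT_INT32},   {7, DT_INT64},   {9, DT_BOOLEAN},
      {10, DT_HALF},  {11, DT_DOUBLE}, {12, DT_UINT32}, {13, DT_UINT64}};
  auto it = kTypeMap.find(onnx_elem_type);
  if (it == kTypeMap.end()) {
    return std::nullopt;  // unsupported, e.g. STRING (8)
  }
  return it->second;
}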
-
-TRITONSERVER_Error*
-OnnxParser::LoadModel(
-    std::function<
-        const std::vector<Realm::Processor>&(Realm::Processor::Kind)>
-        find_local_processor_fn,
-    LegionModelState* model, const PartitionStrategy* strategy,
-    const std::string& onnx_file,
-    std::vector<std::pair<std::string, Tensor*>>* inputs,
-    std::vector<std::pair<std::string, Tensor*>>* outputs,
-    std::vector<Operator*>* layers)
-{
-  onnx::ModelProto onnx_model;
-  {
-    std::string file_content;
-    RETURN_IF_ERROR(ReadTextFile(onnx_file, &file_content));
-    if (!onnx_model.ParseFromString(file_content)) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          std::string("failed to parse ONNX model protobuf from " + onnx_file)
-              .c_str());
-    }
-  }
-
-  // Sanity check
-  RETURN_ERROR_IF_FALSE(
-      (strategy != nullptr), TRITONSERVER_ERROR_INVALID_ARG,
-      std::string("failed to parse ONNX model, strategy is not provided"));
-  RETURN_ERROR_IF_FALSE(
-      (strategy->layers.size() == onnx_model.graph().node().size()),
-      TRITONSERVER_ERROR_INVALID_ARG,
-      std::string("failed to parse ONNX model, layer count in strategy does "
-                  "not match the ONNX model"));
-
-  // WIP
-  // [gluo FIXME] should validate the ONNX model (versioning, op set,
-  // ONNX-checker-like validation, etc.)
-  OnnxParser parser(
-      find_local_processor_fn, model, strategy, onnx_model, inputs, outputs,
-      layers);
-
-  // Note that the weights specified in 'initializer' may also be specified
-  // in 'input', thus we should parse in "weight, input" order so that we can
-  // filter the weights from the inputs.
-  RETURN_IF_ERROR(parser.ParseWeight(onnx_model.graph()));
-  RETURN_IF_ERROR(parser.ParseInput(onnx_model.graph()));
-
-  for (int idx = 0; idx < onnx_model.graph().node().size(); ++idx) {
-    const auto& node = onnx_model.graph().node(idx);
-    auto parser_it = op_type_parser_map_.find(node.op_type());
-    if (parser_it != op_type_parser_map_.end()) {
-      RETURN_IF_ERROR(
-          (parser_it->second)(&parser, strategy->layers[idx], node));
-    } else {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_UNSUPPORTED,
-          (std::string("Layer type '") + node.op_type() +
-           "' is not currently supported")
-              .c_str());
-    }
-  }
-
-  // Output must be parsed at the end, as the output tensors are created while
-  // parsing operators.
-  RETURN_IF_ERROR(parser.ParseOutput(onnx_model.graph()));
-  return nullptr;  // success
-}
-
-OnnxParser::OnnxParser(
-    std::function<
-        const std::vector<Realm::Processor>&(Realm::Processor::Kind)>
-        find_local_processor_fn,
-    LegionModelState* model, const PartitionStrategy* strategy,
-    const onnx::ModelProto& onnx_model,
-    std::vector<std::pair<std::string, Tensor*>>* inputs,
-    std::vector<std::pair<std::string, Tensor*>>* outputs,
-    std::vector<Operator*>* layers)
-    : find_local_processor_fn_(find_local_processor_fn), model_(model),
-      strategy_(strategy), onnx_model_(onnx_model), inputs_(inputs),
-      outputs_(outputs), layers_(layers)
-{
-}
-
-OnnxParser::~OnnxParser()
-{
-  // [gluo FIXME] don't need below if the operators are holding
-  // the smart pointers as well
-  for (auto& tensor : tensors_) {
-    tensor.second.release();
-  }
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseWeight(const onnx::GraphProto& onnx_graph)
-{
-  if (!onnx_graph.sparse_initializer().empty()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_UNSUPPORTED,
-        "ONNX sparse initializer is currently not supported");
-  }
-  for (const auto& initializer : onnx_graph.initializer()) {
-    // Only storing the pointer to the protobuf message, need to interact with
-    // the corresponding layer to load the data properly
-    weights_.emplace(initializer.name(), &initializer);
-  }
-  return nullptr;  // success
-}
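// Parse order is significant here: ONNX allows a tensor to appear both in
// graph.initializer() and in graph.input(), so ParseWeight() runs first and
// ParseInput() (below) skips any input name already recorded in weights_.
// A minimal sketch of that filter, mirroring the loop in ParseInput():
//
//   for (const auto& input : onnx_graph.input()) {
//     if (weights_.find(input.name()) != weights_.end()) {
//       continue;  // an initializer re-listed as an input, not a runtime input
//     }
//     ...
//   }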
-
-TRITONSERVER_Error*
-OnnxParser::ParseInput(const onnx::GraphProto& onnx_graph)
-{
-  for (const auto& input : onnx_graph.input()) {
-    // ignore weights that are also specified in input
-    if (weights_.find(input.name()) != weights_.end()) {
-      continue;
-    }
-    if (!input.type().has_tensor_type()) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_UNSUPPORTED,
-          (std::string("Type for ONNX input '") + input.name() +
-           "' must be tensor")
-              .c_str());
-    }
-    const auto& type_proto = input.type().tensor_type();
-    DataType type;
-    RETURN_IF_ERROR(OnnxTypeToDataType(type_proto.elem_type(), &type));
-    // FIXME an ONNX model that supports batching should have a dynamic first
-    // dimension, which is marked as unsupported currently. May need to use
-    // 'max_batch_size' from the model config as a hint on handling / allowing
-    // batching.
-    std::vector<size_t> dims;
-    for (const auto& dim : type_proto.shape().dim()) {
-      if (dim.has_dim_param()) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            "Dynamic tensor shape is not supported");
-      }
-      dims.emplace_back(dim.dim_value());
-    }
-    std::unique_ptr<Tensor> tensor(new Tensor(nullptr, type, dims));
-    inputs_->emplace_back(input.name(), tensor.get());
-    tensors_.emplace(input.name(), std::move(tensor));
-  }
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseOutput(const onnx::GraphProto& onnx_graph)
-{
-  for (const auto& io : onnx_graph.output()) {
-    auto it = tensors_.find(io.name());
-    if (it == tensors_.end()) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("ONNX output '") + io.name() +
-           "' is not produced by the model")
-              .c_str());
-    }
-    outputs_->emplace_back(io.name(), it->second.get());
-  }
-  return nullptr;  // success
-}
-
-template <int Dim>
-TRITONSERVER_Error*
-OnnxParser::LoadWeight(
-    const LayerStrategy* strategy,
-    std::function<Legion::Rect<Dim>(Realm::Processor)> local_bound_fn,
-    const onnx::TensorProto* weight_proto, Weights* weight)
-{
-  const auto& processors = find_local_processor_fn_(strategy->kind);
-  for (const auto& proc : processors) {
-    if (strategy->is_local_processor(proc)) {
-      size_t proc_idx = strategy->find_local_offset(proc);
-      weight->local_bounds[proc_idx] = Legion::Domain(local_bound_fn(proc));
-      const auto& local_bounds = weight->local_bounds[proc_idx];
-      size_t total_byte_size = sizeof_datatype(weight->type);
-      for (int dim_idx = (weight->bounds.size() - 1); dim_idx >= 0;
-           --dim_idx) {
-        weight->local_strides[proc_idx][dim_idx] = total_byte_size;
-        total_byte_size *=
-            ((local_bounds.hi()[dim_idx] + 1) - local_bounds.lo()[dim_idx]);
-      }
-      weight->local_allocation[proc_idx] = std::malloc(total_byte_size);
-      if (weight->local_allocation[proc_idx] == nullptr) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL,
-            (std::string(
-                 "Failed to allocate local system memory for weight of '" +
-                 std::to_string(weight->owner->op_type) + "' layer named '" +
-                 weight->owner->op_name + "'")
-                 .c_str()));
-      }
-    }
-  }
-
-  // [FIXME] need to expand to be able to load from external files
-  if (weight_proto->has_data_location() &&
-      (weight_proto->data_location() ==
-       onnx::TensorProto::DataLocation::TensorProto_DataLocation_EXTERNAL)) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_UNSUPPORTED,
-        "Loading weight stored outside of the ONNX file is currently not "
-        "supported");
-  }
-  const void* weight_ptr = weight_proto->has_data_location()
-                               ? nullptr
-                               : weight_proto->raw_data().data();
-  // boolean value stored in raw_data is represented in 1 byte (00000001 for
-  // true, 00000000 for false), thus special handling is required
-  // https://github.com/onnx/onnx/blob/v1.9.0/onnx/onnx-ml.proto#L558
-  bool is_raw_boolean =
-      ((weight_ptr != nullptr) && (weight->type == DT_BOOLEAN));
-  if (weight_ptr == nullptr) {
-    switch (weight->type) {
-      case DT_INT8:
-      case DT_UINT8:
-      case DT_BOOLEAN:
-      case DT_INT16:
-      case DT_UINT16:
-      case DT_HALF:
-      case DT_INT32:
-        weight_ptr = weight_proto->int32_data().data();
-        break;
-      case DT_FLOAT:
-        weight_ptr = weight_proto->float_data().data();
-        break;
-      case DT_DOUBLE:
-        weight_ptr = weight_proto->double_data().data();
-        break;
-      case DT_INT64:
-        weight_ptr = weight_proto->int64_data().data();
-        break;
-      case DT_UINT32:
-      case DT_UINT64:
-        weight_ptr = weight_proto->uint64_data().data();
-        break;
-      default:
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            "Loading weight of unsupported data type");
-        break;
-    }
-  }
-  size_t total_byte_size = sizeof_datatype(weight->type);
-  std::vector<size_t> strides(weight->bounds.size());
-  for (int dim_idx = (weight->bounds.size() - 1); dim_idx >= 0; --dim_idx) {
-    strides[dim_idx] = total_byte_size;
-    total_byte_size *= weight->bounds[dim_idx];
-  }
-  for (size_t proc_idx = 0; proc_idx < MAX_LOCAL_PROCS; ++proc_idx) {
-    if (weight->local_allocation[proc_idx] != nullptr) {
-      RETURN_IF_ERROR(SetElementData(
-          strides, weight->local_bounds[proc_idx],
-          weight->local_strides[proc_idx], 0, is_raw_boolean,
-          reinterpret_cast<const char*>(weight_ptr),
-          reinterpret_cast<char*>(weight->local_allocation[proc_idx])));
-    }
-  }
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::SetElementData(
-    const std::vector<size_t>& strides, const Legion::Domain& local_bounds,
-    const size_t* local_strides, size_t dim_idx, const bool is_raw_boolean,
-    const char* src_data, char* dst_data)
-{
-  if (dim_idx == (strides.size() - 1)) {
-    if (is_raw_boolean) {
-      // boolean value stored in raw_data is represented in 1 byte (00000001
-      // for true, 00000000 for false), thus special handling is required
-      // https://github.com/onnx/onnx/blob/v1.9.0/onnx/onnx-ml.proto#L558
-      size_t dst_idx_offset = 0;
-      for (size_t idx = local_bounds.lo()[dim_idx];
-           idx <= local_bounds.hi()[dim_idx]; ++idx) {
-        reinterpret_cast<bool*>(dst_data)[dst_idx_offset] =
-            (src_data[idx] == 1);
-        ++dst_idx_offset;
-      }
-    } else {
-      // Assuming the layout is always packed in both src and dst, otherwise
-      // the data should be set element by element
-      size_t src_offset = strides[dim_idx] * local_bounds.lo()[dim_idx];
-      size_t byte_size = strides[dim_idx] * ((local_bounds.hi()[dim_idx] + 1) -
-                                             local_bounds.lo()[dim_idx]);
-      std::memcpy(dst_data, src_data + src_offset, byte_size);
-    }
-  } else {
-    for (size_t idx = local_bounds.lo()[dim_idx];
-         idx <= local_bounds.hi()[dim_idx]; ++idx) {
-      RETURN_IF_ERROR(SetElementData(
-          strides, local_bounds, local_strides, dim_idx + 1, is_raw_boolean,
-          src_data + strides[dim_idx] * idx,
-          dst_data +
-              local_strides[dim_idx] * (idx - local_bounds.lo()[dim_idx])));
-    }
-  }
-  return nullptr;  // success
-}
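// LoadWeight() and SetElementData() assume densely packed row-major data:
// strides accumulate from the innermost dimension outward. A worked example
// with hypothetical numbers: a DT_FLOAT weight with bounds {64, 3, 7, 7}
// (element size 4 bytes) gets, innermost first,
//
//   strides[3] = 4, strides[2] = 4 * 7 = 28, strides[1] = 28 * 7 = 196,
//   strides[0] = 196 * 3 = 588,
//
// and total_byte_size = 588 * 64 = 37632 bytes. The recursion then copies one
// contiguous innermost row per call at the deepest level.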
-
-TRITONSERVER_Error*
-OnnxParser::ParseConv2D(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  // Layer attributes
-  size_t groups = 1;
-  size_t kernel_h = 0;
-  size_t kernel_w = 0;
-  size_t padding_h = 0;
-  size_t padding_w = 0;
-  size_t stride_h = 1;
-  size_t stride_w = 1;
-  size_t dilation_h = 1;
-  size_t dilation_w = 1;
-  for (const auto& attribute : onnx_node.attribute()) {
-    if (attribute.name() == "auto_pad") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_STRING);
-      if (attribute.s() != "NOTSET") {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value '") + attribute.s() +
-             "' for attribute '" + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 'NOTSET'")
-                .c_str());
-      }
-    } else if (attribute.name() == "dilations") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      for (const auto dilation : attribute.ints()) {
-        if (dilation != 1) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              (std::string("Unsupported attribute value for attribute '") +
-               attribute.name() + "' in '" + onnx_node.op_type() +
-               "' layer named '" + onnx_node.name() +
-               "', each of the attribute values must be 1")
-                  .c_str());
-        }
-      }
-    } else if (attribute.name() == "group") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      groups = attribute.i();
-    } else if (attribute.name() == "kernel_shape") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 2) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 2 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      kernel_h = attribute.ints(0);
-      kernel_w = attribute.ints(1);
-    } else if (attribute.name() == "pads") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 4) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 4 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      if ((attribute.ints(0) != attribute.ints(1)) ||
-          (attribute.ints(2) != attribute.ints(3))) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have the same value for the start and end padding of "
-             "the same axis")
-                .c_str());
-      }
-      padding_h = attribute.ints(0);
-      padding_w = attribute.ints(2);
-    } else if (attribute.name() == "strides") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 2) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 2 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      stride_h = attribute.ints(0);
-      stride_w = attribute.ints(1);
-    } else {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("Unknown attribute '") + attribute.name() + "' for '" +
-           onnx_node.op_type() + "' layer named '" + onnx_node.name() + "'")
-              .c_str());
-    }
-  }
-
-  // Input
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-  if (input->bounds.size() != 4) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Input tensor '") + onnx_node.input(0) + "' for '" +
-         onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-         "' must have shape (N, C, H, W)")
-            .c_str());
-  }
-  size_t in_channels = input->bounds[1];
-
-  // Weight (defer construction of the tensor, need to be owned by the layer)
-  auto weight_it = parser->weights_.find(onnx_node.input(1));
-  if (weight_it == parser->weights_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find weight '") + onnx_node.input(1) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the weight must be specified as an initializer of the model")
-            .c_str());
-  }
-  const auto& weight_proto = weight_it->second;
-  DataType weight_dt;
-  RETURN_IF_ERROR(OnnxTypeToDataType(weight_proto->data_type(), &weight_dt));
-  std::vector<size_t> weight_dims;
-  for (const auto& dim : weight_proto->dims()) {
-    weight_dims.emplace_back(dim);
-  }
-  if ((weight_dims.size() != 4) ||
-      ((weight_dims[1] * groups) != in_channels)) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Weight tensor '") + onnx_node.input(1) + "' for '" +
-         onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-         "' must have shape (M, C/group, kH, kW)")
-            .c_str());
-  } else if (
-      ((kernel_h != 0) || (kernel_w != 0)) &&
-      ((kernel_h != weight_dims[2]) || (kernel_w != weight_dims[3]))) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Weight tensor '") + onnx_node.input(1) + "' for '" +
-         onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-         "' has a different kernel shape than the shape specified in the "
-         "layer attributes")
-            .c_str());
-  }
-  size_t out_channels = weight_dims[0];
-  kernel_h = weight_dims[2];
-  kernel_w = weight_dims[3];
-
-  // Bias (defer construction of the tensor, need to be owned by the layer).
-  // Only look the bias up when a third input is actually present.
-  bool use_bias = (onnx_node.input().size() == 3);
-  const onnx::TensorProto* bias_proto = nullptr;
-  DataType bias_dt = DT_NONE;
-  std::vector<size_t> bias_dims;
-  if (use_bias) {
-    auto bias_it = parser->weights_.find(onnx_node.input(2));
-    if (bias_it == parser->weights_.end()) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("Unable to find bias '") + onnx_node.input(2) +
-           "' for '" + onnx_node.op_type() + "' layer named '" +
-           onnx_node.name() +
-           "', the bias must be specified as an initializer of the model")
-              .c_str());
-    }
-    bias_proto = bias_it->second;
-    RETURN_IF_ERROR(OnnxTypeToDataType(bias_proto->data_type(), &bias_dt));
-    for (const auto& dim : bias_proto->dims()) {
-      bias_dims.emplace_back(dim);
-    }
-    if ((bias_dims.size() != 1) || (out_channels != bias_dims[0])) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("Bias tensor '") + onnx_node.input(2) + "' for '" +
-           onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-           "' must have shape (M)")
-              .c_str());
-    }
-  }
-
-  // Construct layer
-  std::unique_ptr<Conv2D> conv2d_op(new Conv2D(
-      parser->model_, strategy, in_channels, out_channels, kernel_h, kernel_w,
-      stride_h, stride_w, padding_h, padding_w, ActivationMode::AC_MODE_NONE,
-      groups, use_bias, onnx_node.name().c_str()));
-  auto conv2d_op_ptr = conv2d_op.get();
-
-  // Finalize weight, bias, and output
-  std::unique_ptr<Weights> weight(
-      new Weights(conv2d_op.get(), weight_dt, weight_dims));
-
-  std::unique_ptr<Weights> bias(
-      use_bias ? new Weights(conv2d_op.get(), bias_dt, bias_dims) : nullptr);
-
-  // take the floor of the calculated result
-  size_t output_h =
-      (input->bounds[2] + 2 * padding_h - dilation_h * (kernel_h - 1) - 1) /
-          stride_h +
-      1;
-  size_t output_w =
-      (input->bounds[3] + 2 * padding_w - dilation_w * (kernel_w - 1) - 1) /
-          stride_w +
-      1;
-  std::unique_ptr<Tensor> output(new Tensor(
-      conv2d_op.get(), input->type,
-      {input->bounds[0], out_channels, output_h, output_w}));
-
-  conv2d_op->Configure(input.get(), weight.get(), output.get(), bias.get());
-
-  // Load weights after the layer is configured, as the bounds can only be
-  // computed after that
-  RETURN_IF_ERROR(parser->LoadWeight<4>(
-      strategy,
-      [conv2d_op_ptr](Realm::Processor proc) {
-        return conv2d_op_ptr->GetWeightBounds(proc);
-      },
-      weight_proto, weight.get()));
-  if (bias != nullptr) {
-    RETURN_IF_ERROR(parser->LoadWeight<1>(
-        strategy,
-        [conv2d_op_ptr](Realm::Processor proc) {
-          return conv2d_op_ptr->GetBiasBounds(proc);
-        },
-        bias_proto, bias.get()));
-  }
-  // Weights are released here as they are not placed in 'tensors_'
-  weight.release();
-  bias.release();
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(conv2d_op.release());
-  return nullptr;  // success
-}
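// The output size above is the standard ONNX Conv formula,
// out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1.
// Hypothetical example: a 224x224 input with kernel 7, stride 2, padding 3,
// dilation 1 gives (224 + 6 - 6 - 1) / 2 + 1 = 111 + 1 = 112, i.e. a 112x112
// output; the integer division supplies the floor.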
-
-TRITONSERVER_Error*
-OnnxParser::ParseFlatten(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  // FIXME: Flatten parsing is not implemented yet; this stub only dumps the
-  // node and reports success without producing an output tensor
-  std::cerr << onnx_node.DebugString() << std::endl;
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseAveragePool(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  // Layer attributes
-  size_t kernel_h = 0;
-  size_t kernel_w = 0;
-  size_t padding_h = 0;
-  size_t padding_w = 0;
-  size_t stride_h = 1;
-  size_t stride_w = 1;
-  for (const auto& attribute : onnx_node.attribute()) {
-    if (attribute.name() == "auto_pad") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_STRING);
-      if (attribute.s() != "NOTSET") {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value '") + attribute.s() +
-             "' for attribute '" + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 'NOTSET'")
-                .c_str());
-      }
-    } else if (attribute.name() == "ceil_mode") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      if (attribute.i() != 0) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value for attribute '") +
-             attribute.name() + "' in '" + onnx_node.op_type() +
-             "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 0")
-                .c_str());
-      }
-    } else if (attribute.name() == "count_include_pad") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      if (attribute.i() != 0) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value for attribute '") +
-             attribute.name() + "' in '" + onnx_node.op_type() +
-             "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 0")
-                .c_str());
-      }
-    } else if (attribute.name() == "kernel_shape") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 2) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 2 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      kernel_h = attribute.ints(0);
-      kernel_w = attribute.ints(1);
-    } else if (attribute.name() == "pads") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 4) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 4 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      if ((attribute.ints(0) != attribute.ints(1)) ||
-          (attribute.ints(2) != attribute.ints(3))) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have the same value for the start and end padding of "
-             "the same axis")
-                .c_str());
-      }
-      padding_h = attribute.ints(0);
-      padding_w = attribute.ints(2);
-    } else if (attribute.name() == "strides") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 2) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 2 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      stride_h = attribute.ints(0);
-      stride_w = attribute.ints(1);
-    } else {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("Unknown attribute '") + attribute.name() + "' for '" +
-           onnx_node.op_type() + "' layer named '" + onnx_node.name() + "'")
-              .c_str());
-    }
-  }
-
-  // Input
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-  if (input->bounds.size() != 4) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Input tensor '") + onnx_node.input(0) + "' for '" +
-         onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-         "' must have shape (N, C, H, W)")
-            .c_str());
-  }
-
-  // Construct layer
-  std::unique_ptr<Pool2D> pool_op(new Pool2D(
-      parser->model_, strategy, kernel_h, kernel_w, stride_h, stride_w,
-      padding_h, padding_w, PoolType::POOL_AVG, ActivationMode::AC_MODE_NONE,
-      onnx_node.name().c_str()));
-
-  // Finalize output
-  size_t output_h =
-      (input->bounds[2] + 2 * padding_h - kernel_h) / stride_h + 1;
-  size_t output_w =
-      (input->bounds[3] + 2 * padding_w - kernel_w) / stride_w + 1;
-  std::unique_ptr<Tensor> output(new Tensor(
-      pool_op.get(), input->type,
-      {input->bounds[0], input->bounds[1], output_h, output_w}));
-
-  pool_op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(pool_op.release());
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseMaxPool(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  // Layer attributes
-  size_t kernel_h = 0;
-  size_t kernel_w = 0;
-  size_t padding_h = 0;
-  size_t padding_w = 0;
-  size_t stride_h = 1;
-  size_t stride_w = 1;
-  size_t dilation_h = 1;
-  size_t dilation_w = 1;
-  for (const auto& attribute : onnx_node.attribute()) {
-    if (attribute.name() == "auto_pad") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_STRING);
-      if (attribute.s() != "NOTSET") {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value '") + attribute.s() +
-             "' for attribute '" + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 'NOTSET'")
-                .c_str());
-      }
-    } else if (attribute.name() == "ceil_mode") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      if (attribute.i() != 0) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value for attribute '") +
-             attribute.name() + "' in '" + onnx_node.op_type() +
-             "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 0")
-                .c_str());
-      }
-    } else if (attribute.name() == "dilations") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      for (const auto dilation : attribute.ints()) {
-        if (dilation != 1) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              (std::string("Unsupported attribute value for attribute '") +
-               attribute.name() + "' in '" + onnx_node.op_type() +
-               "' layer named '" + onnx_node.name() +
-               "', each of the attribute values must be 1")
-                  .c_str());
-        }
-      }
-    } else if (attribute.name() == "kernel_shape") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 2) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 2 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      kernel_h = attribute.ints(0);
-      kernel_w = attribute.ints(1);
-    } else if (attribute.name() == "pads") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 4) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 4 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      if ((attribute.ints(0) != attribute.ints(1)) ||
-          (attribute.ints(2) != attribute.ints(3))) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have the same value for the start and end padding of "
-             "the same axis")
-                .c_str());
-      }
-      padding_h = attribute.ints(0);
-      padding_w = attribute.ints(2);
-    } else if (attribute.name() == "storage_order") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      if (attribute.i() != 0) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_UNSUPPORTED,
-            (std::string("Unsupported attribute value for attribute '") +
-             attribute.name() + "' in '" + onnx_node.op_type() +
-             "' layer named '" + onnx_node.name() +
-             "', the only currently supported value is 0")
-                .c_str());
-      }
-    } else if (attribute.name() == "strides") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INTS);
-      if (attribute.ints().size() != 2) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INVALID_ARG,
-            (std::string("Attribute '") + attribute.name() + "' in '" +
-             onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-             "' must have 2 values, got " +
-             std::to_string(attribute.ints().size()))
-                .c_str());
-      }
-      stride_h = attribute.ints(0);
-      stride_w = attribute.ints(1);
-    } else {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("Unknown attribute '") + attribute.name() + "' for '" +
-           onnx_node.op_type() + "' layer named '" + onnx_node.name() + "'")
-              .c_str());
-    }
-  }
-
-  // Input
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-  if (input->bounds.size() != 4) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Input tensor '") + onnx_node.input(0) + "' for '" +
-         onnx_node.op_type() + "' layer named '" + onnx_node.name() +
-         "' must have shape (N, C, H, W)")
-            .c_str());
-  }
-
-  // Construct layer
-  std::unique_ptr<Pool2D> pool_op(new Pool2D(
-      parser->model_, strategy, kernel_h, kernel_w, stride_h, stride_w,
-      padding_h, padding_w, PoolType::POOL_MAX, ActivationMode::AC_MODE_NONE,
-      onnx_node.name().c_str()));
-
-  // Finalize output
-  if (onnx_node.output().size() != 1) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Expected only 1 output for '") + onnx_node.op_type() +
-         "' layer named '" + onnx_node.name() + "'")
-            .c_str());
-  }
-  size_t output_h =
-      (input->bounds[2] + 2 * padding_h - dilation_h * (kernel_h - 1) - 1) /
-          stride_h +
-      1;
-  size_t output_w =
-      (input->bounds[3] + 2 * padding_w - dilation_w * (kernel_w - 1) - 1) /
-          stride_w +
-      1;
-
-  std::unique_ptr<Tensor> output(new Tensor(
-      pool_op.get(), input->type,
-      {input->bounds[0], input->bounds[1], output_h, output_w}));
-
-  pool_op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(pool_op.release());
-  return nullptr;  // success
-}
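// Pooling uses the same floor convention: AveragePool computes
// out = floor((in + 2*pad - kernel) / stride) + 1, and MaxPool additionally
// subtracts dilation*(kernel - 1) + 1 instead of the bare kernel, which
// coincides with AveragePool since dilations are restricted to 1 above.
// Hypothetical example: a 112x112 input with kernel 3, stride 2, padding 1
// gives (112 + 2 - 3) / 2 + 1 = 55 + 1 = 56, i.e. a 56x56 output.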
-
-TRITONSERVER_Error*
-OnnxParser::ParseSoftmax(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  int axis = -1;
-
-  // Input
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-
-  // Axis
-  for (const auto& attribute : onnx_node.attribute()) {
-    if (attribute.name() == "axis") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      axis = attribute.i();
-      break;
-    }
-  }
-
-  if (axis < -1 * (int)input->bounds.size() ||
-      axis >= (int)input->bounds.size()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string(
-             "Attribute 'axis' in '" + onnx_node.op_type() +
-             "' layer named '" + onnx_node.name() +
-             "' must be between [-r, r-1] where r = rank(input), got " +
-             std::to_string(axis) + std::string(" with rank ") +
-             std::to_string(input->bounds.size()))
-             .c_str()));
-  }
-
-  if (axis <= -1)
-    axis = input->bounds.size() + axis;
-
-  std::unique_ptr<Softmax> softmax_op(
-      new Softmax(parser->model_, strategy, axis, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(softmax_op.get(), input->type, input->bounds));
-  softmax_op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(softmax_op.release());
-
-  return nullptr;  // success
-}
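// ONNX allows a negative softmax axis counted from the back; the code above
// normalizes it as axis = rank + axis. A standalone sketch of the rule
// (assumed semantics per the ONNX Softmax spec):
//
//   static inline int NormalizeAxis(int axis, int rank)
//   {
//     // valid range is [-rank, rank - 1]
//     return (axis < 0) ? rank + axis : axis;  // e.g. rank 4, axis -1 -> 3
//   }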
-
-TRITONSERVER_Error*
-OnnxParser::ParseRelu(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-
-  std::unique_ptr<UnaryOperator> unary_op(new UnaryOperator(
-      parser->model_, strategy, OperatorType::OP_RELU, nullptr, DT_NONE,
-      false /*inplace*/, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(unary_op.get(), input->type, input->bounds));
-  unary_op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(unary_op.release());
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseAdd(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  return parser->ParseBinary(strategy, onnx_node, OperatorType::OP_EW_ADD);
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseSub(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  return parser->ParseBinary(strategy, onnx_node, OperatorType::OP_EW_SUB);
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseMul(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  return parser->ParseBinary(strategy, onnx_node, OperatorType::OP_EW_MUL);
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseBinary(
-    const LayerStrategy* strategy, const onnx::NodeProto& onnx_node,
-    OperatorType op_type)
-{
-  if (onnx_node.input().size() != 2) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("'") + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() + "' must have 2 inputs, got " +
-         std::to_string(onnx_node.input().size()))
-            .c_str());
-  }
-
-  auto input_it0 = tensors_.find(onnx_node.input(0));
-  auto input_it1 = tensors_.find(onnx_node.input(1));
-  if (input_it0 == tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  if (input_it1 == tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(1) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input0 = input_it0->second;
-  auto& input1 = input_it1->second;
-
-  // Error checking
-  if (input0->type != input1->type) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Non-matching input types: ") +
-         std::to_string(input0->type) + std::string(" and ") +
-         std::to_string(input1->type))
-            .c_str());
-  }
-
-  // [gluo FIXME] broadcasting not currently supported
-  if (input0->bounds != input1->bounds) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Non-matching input bounds, input ranks are ") +
-         std::to_string(input0->bounds.size()) + std::string(" and ") +
-         std::to_string(input1->bounds.size()))
-            .c_str());
-  }
-
-  std::unique_ptr<BinaryOperator> binary_op(new BinaryOperator(
-      model_, strategy, op_type, false /*inplace*/,
-      onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(binary_op.get(), input1->type, input1->bounds));
-  binary_op->Configure(input0.get(), input1.get(), output.get());
-
-  tensors_.emplace(onnx_node.output(0), std::move(output));
-  layers_->emplace_back(binary_op.release());
-
-  return nullptr;  // success
-}
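// ParseBinary above insists on identical element types and identical bounds
// because broadcasting is not yet supported (the [gluo FIXME]). So, e.g.,
// {8, 64, 32, 32} + {8, 64, 32, 32} parses, while the typical broadcast
// pattern {8, 64, 32, 32} + {1, 64, 1, 1} is rejected; such a model would
// need its second operand pre-expanded to the full shape to load here.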
-
-TRITONSERVER_Error*
-OnnxParser::ParseIdentity(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  if (onnx_node.input().size() != 1) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("'") + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() + "' must have 1 input, got " +
-         std::to_string(onnx_node.input().size()))
-            .c_str());
-  }
-
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-
-  // Identity doesn't use a scalar, so pass a zero-filled buffer that is
-  // large enough for any scalar type
-  uint64_t scalar_value = 0;
-  std::unique_ptr<UnaryOperator> op(new UnaryOperator(
-      parser->model_, strategy, OperatorType::OP_IDENTITY, &scalar_value,
-      input->type, false /* inplace */, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(op.get(), input->type, input->bounds));
-  op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(op.release());
-
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseCast(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  if (onnx_node.input().size() != 1) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("'") + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() + "' must have 1 input, got " +
-         std::to_string(onnx_node.input().size()))
-            .c_str());
-  }
-
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-  DataType new_type = DT_NONE;
-
-  for (const auto& attribute : onnx_node.attribute()) {
-    if (attribute.name() == "to") {
-      RETURN_IF_TYPE_MISMATCH(
-          onnx_node, attribute,
-          onnx::AttributeProto::AttributeType::
-              AttributeProto_AttributeType_INT);
-      RETURN_IF_ERROR(OnnxTypeToDataType(attribute.i(), &new_type));
-      break;
-    }
-  }
-
-  // Cast doesn't use a scalar, so pass a zero-filled buffer that is large
-  // enough for any scalar type
-  uint64_t scalar_value = 0;
-  std::unique_ptr<UnaryOperator> op(new UnaryOperator(
-      parser->model_, strategy, OperatorType::OP_CAST, &scalar_value,
-      input->type, false /* inplace */, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(op.get(), new_type, input->bounds));
-  op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(op.release());
-
-  return nullptr;  // success
-}
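// The Cast "to" attribute carries a raw onnx::TensorProto_DataType integer
// that OnnxTypeToDataType() (top of this file) maps onto the backend
// DataType. A small usage sketch (enum values per the ONNX spec):
//
//   DataType to_type;
//   TRITONSERVER_Error* err = OnnxTypeToDataType(1 /* FLOAT */, &to_type);
//   // on success err == nullptr and to_type == DT_FLOAT;
//   // 10 (FLOAT16) -> DT_HALF, 7 (INT64) -> DT_INT64, 9 (BOOL) -> DT_BOOLEAN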
-
-TRITONSERVER_Error*
-OnnxParser::ParseTanh(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  if (onnx_node.input().size() != 1) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("'") + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() + "' must have 1 input, got " +
-         std::to_string(onnx_node.input().size()))
-            .c_str());
-  }
-
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-
-  // Tanh doesn't use a scalar, so pass a zero-filled buffer that is large
-  // enough for any scalar type
-  uint64_t scalar_value = 0;
-  std::unique_ptr<UnaryOperator> op(new UnaryOperator(
-      parser->model_, strategy, OperatorType::OP_TANH, &scalar_value,
-      input->type, false /* inplace */, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(op.get(), input->type, input->bounds));
-  op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(op.release());
-
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseReciprocal(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  if (onnx_node.input().size() != 1) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("'") + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() + "' must have 1 input, got " +
-         std::to_string(onnx_node.input().size()))
-            .c_str());
-  }
-
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-
-  // Reciprocal doesn't use a scalar, so pass a zero-filled buffer that is
-  // large enough for any scalar type
-  uint64_t scalar_value = 0;
-  std::unique_ptr<UnaryOperator> op(new UnaryOperator(
-      parser->model_, strategy, OperatorType::OP_RECIPROCAL, &scalar_value,
-      input->type, false /* inplace */, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(op.get(), input->type, input->bounds));
-  op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(op.release());
-
-  return nullptr;  // success
-}
-
-TRITONSERVER_Error*
-OnnxParser::ParseSqrt(
-    OnnxParser* parser, const LayerStrategy* strategy,
-    const onnx::NodeProto& onnx_node)
-{
-  if (onnx_node.input().size() != 1) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("'") + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() + "' must have 1 input, got " +
-         std::to_string(onnx_node.input().size()))
-            .c_str());
-  }
-
-  auto input_it = parser->tensors_.find(onnx_node.input(0));
-  if (input_it == parser->tensors_.end()) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("Unable to find tensor '") + onnx_node.input(0) +
-         "' for '" + onnx_node.op_type() + "' layer named '" +
-         onnx_node.name() +
-         "', the tensor must be specified either as a model input or as the "
-         "output of a layer that precedes this layer")
-            .c_str());
-  }
-  auto& input = input_it->second;
-
-  // Sqrt doesn't use a scalar, so pass a zero-filled buffer that is large
-  // enough for any scalar type
-  uint64_t scalar_value = 0;
-  std::unique_ptr<UnaryOperator> op(new UnaryOperator(
-      parser->model_, strategy, OperatorType::OP_SQRT, &scalar_value,
-      input->type, false /* inplace */, onnx_node.name().c_str()));
-  std::unique_ptr<Tensor> output(
-      new Tensor(op.get(), input->type, input->bounds));
-  op->Configure(input.get(), output.get());
-
-  parser->tensors_.emplace(onnx_node.output(0), std::move(output));
-  parser->layers_->emplace_back(op.release());
-
-  return nullptr;  // success
-}
-
-}}}  // namespace triton::backend::legion
diff --git a/triton/src/onnx_parser.h b/triton/src/onnx_parser.h
deleted file mode 100755
index d6892f7a07..0000000000
--- a/triton/src/onnx_parser.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <functional>
-#include <map>
-#include "model.h"
-#include "onnx/onnx-ml.pb.h"
-#include "operator.h"
-#include "strategy.h"
-#include "triton/core/tritonserver.h"
-#include "types.h"
-
-namespace triton { namespace backend { namespace legion {
-
-class OnnxParser {
- public:
-  static TRITONSERVER_Error* LoadModel(
-      std::function<
-          const std::vector<Realm::Processor>&(Realm::Processor::Kind)>
-          find_local_processor_fn,
-      LegionModelState* model, const PartitionStrategy* strategy,
-      const std::string& onnx_file,
-      std::vector<std::pair<std::string, Tensor*>>* inputs,
-      std::vector<std::pair<std::string, Tensor*>>* outputs,
-      std::vector<Operator*>* layers);
-  OnnxParser(
-      std::function<
-          const std::vector<Realm::Processor>&(Realm::Processor::Kind)>
-          find_local_processor_fn,
-      LegionModelState* model, const PartitionStrategy* strategy,
-      const onnx::ModelProto& onnx_model,
-      std::vector<std::pair<std::string, Tensor*>>* inputs,
-      std::vector<std::pair<std::string, Tensor*>>* outputs,
-      std::vector<Operator*>* layers);
-  ~OnnxParser();
-
- private:
-  static TRITONSERVER_Error* ParseConv2D(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseFlatten(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseAveragePool(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseMaxPool(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseSoftmax(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseRelu(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseAdd(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseSub(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseMul(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseIdentity(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseCast(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseTanh(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseReciprocal(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-  static TRITONSERVER_Error* ParseSqrt(
-      OnnxParser* parser, const LayerStrategy* strategy,
-      const onnx::NodeProto& onnx_node);
-
-  TRITONSERVER_Error* ParseInput(const onnx::GraphProto& onnx_graph);
-  TRITONSERVER_Error* ParseWeight(const onnx::GraphProto& onnx_graph);
-  TRITONSERVER_Error* ParseOutput(const onnx::GraphProto& onnx_graph);
-
-  TRITONSERVER_Error* ParseBinary(
-      const LayerStrategy* strategy, const onnx::NodeProto& onnx_node,
-      OperatorType op_type);
-
-  template <int Dim>
-  TRITONSERVER_Error* LoadWeight(
-      const LayerStrategy* strategy,
-      std::function<Legion::Rect<Dim>(Realm::Processor)> local_bound_fn,
-      const onnx::TensorProto* weight_proto, Weights* weight);
-  TRITONSERVER_Error* SetElementData(
-      const std::vector<size_t>& strides, const Legion::Domain& local_bounds,
-      const size_t* local_strides, size_t dim_idx, const bool is_raw_boolean,
-      const char* src_data, char* dst_data);
-
-  using ParseFn_t = std::function<TRITONSERVER_Error*(
-      OnnxParser*, const LayerStrategy*, const onnx::NodeProto&)>;
-  static std::map<std::string, ParseFn_t> op_type_parser_map_;
-  std::function<const std::vector<Realm::Processor>&(Realm::Processor::Kind)>
-      find_local_processor_fn_;
-  LegionModelState* const model_;
-  const PartitionStrategy* strategy_;
-  const onnx::ModelProto& onnx_model_;
-  std::vector<std::pair<std::string, Tensor*>>* inputs_;
-  std::vector<std::pair<std::string, Tensor*>>* outputs_;
-  std::vector<Operator*>* layers_;
-  std::map<std::string, std::unique_ptr<Tensor>> tensors_;
-  std::map<std::string, const onnx::TensorProto*> weights_;
-};
-
-}}}  // namespace triton::backend::legion
diff --git a/triton/src/operator.cc b/triton/src/operator.cc
deleted file mode 100644
index f51cda6aea..0000000000
--- a/triton/src/operator.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "operator.h"
-#include "operators/binary.h"
-#include "operators/concat.h"
-#include "operators/conv2d.h"
-#include "operators/matmul.h"
-#include "operators/reshape.h"
-#include "operators/softmax.h"
-#include "operators/unary.h"
-#include "tensor.h"
-
-namespace triton { namespace backend { namespace legion {
-
-Operator::Operator(
-    LegionModelState* m, const LayerStrategy* s, OperatorType t,
-    const char* name, unsigned in, unsigned wts, unsigned out)
-    : op_type(t), op_name(name), model(m), strategy(s), num_inputs(in),
-      num_weights(wts), num_outputs(out)
-{
-}
-
-Operator::~Operator(void)
-{
-  // Delete all the weight and output tensors
-  for (auto wts : weights) delete wts;
-  for (auto tensor : outputs) delete tensor;
-}
-
-/*static*/ void
-Operator::PreregisterTaskVariants(void)
-{
-  BinaryOperator::PreregisterTaskVariants();
-  Concat::PreregisterTaskVariants();
-  Conv2D::PreregisterTaskVariants();
-  MatMul::PreregisterTaskVariants();
-  Reshape::PreregisterTaskVariants();
-  Softmax::PreregisterTaskVariants();
-  UnaryOperator::PreregisterTaskVariants();
-}
-
-}}}  // namespace triton::backend::legion
diff --git a/triton/src/operator.h b/triton/src/operator.h
deleted file mode 100644
index dba58a7129..0000000000
--- a/triton/src/operator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LEGION_TRITON_OPERATOR_H__
-#define __LEGION_TRITON_OPERATOR_H__
-
-#include "accessor.h"
-#include "config.h"
-#include "instance.h"
-#include "legion.h"
-#include "model.h"
-#include "strategy.h"
-#include "types.h"
-
-namespace triton { namespace backend { namespace legion {
-
-class Operator;
-
-struct OperatorArgs {
- public:
-  OperatorArgs(bool prof = false) : profiling(prof), owner(nullptr) {}
-
- public:
-  bool profiling;
-  Operator* owner;  // technically not legion safe, debugging/profiling only
-#if 0
-  cudnnHandle_t dnn;
-  cublasHandle_t blas;
-  bool allowTensorOpMathConversion;
-#ifdef USE_NCCL
-  ncclComm_t ncclComm;
-#endif
-#endif
-};
-
-class Operator {
- public:
-  Operator(
-      LegionModelState* model, const LayerStrategy* strategy,
-      OperatorType type, const char* name, unsigned num_inputs,
-      unsigned num_weights, unsigned num_outputs);
-  virtual ~Operator(void);
-
- public:
-  // Called by model load (Realm)
-  virtual void Load(Realm::Processor processor) = 0;
-  // Called per instance (Legion)
-  virtual void initialize(
-      LegionModelInstance* instance, const unsigned instance_index,
-      Legion::Runtime* runtime, Legion::Context ctx,
-      Legion::MapperID mapper) = 0;
-  virtual void forward(
-      LegionModelInstance* instance, const unsigned instance_index,
-      Legion::Runtime* runtime, Legion::Context ctx,
-      Legion::MapperID mapper) = 0;
-  virtual void finalize(
-      LegionModelInstance* instance, const unsigned instance_index,
-      Legion::Runtime* runtime, Legion::Context ctx,
-      Legion::MapperID mapper) = 0;
-  // Called by model free (Realm)
-  virtual void Free(Realm::Processor processor) = 0;
-
- public:
-  static void PreregisterTaskVariants(void);
-
- public:
-  const OperatorType op_type;
-  const std::string op_name;
-  LegionModelState* const model;
-  const LayerStrategy* const strategy;
-  const unsigned num_inputs;
-  const unsigned num_weights;
-  const unsigned num_outputs;
-
- protected:
-  Legion::IndexSpace launch_space[MAX_NUM_INSTANCES];
-  std::vector<Tensor*> inputs;
-  std::vector<Tensor*> outputs;
-  std::vector<Weights*> weights;
-};
-
-}}}  // namespace triton::backend::legion
-
-#endif  // __LEGION_TRITON_OPERATOR_H__
diff --git a/triton/src/operators/binary.cc b/triton/src/operators/binary.cc
deleted file mode 100644
index c04b3d3d24..0000000000
--- a/triton/src/operators/binary.cc
+++ /dev/null
@@ -1,363 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "binary.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -BinaryOperator::BinaryOperator( - LegionModelState* model, const LayerStrategy* strategy, OperatorType type, - bool inplace, const char* name) - : Operator(model, strategy, type, name, 2, 0, 1), inplace(inplace) -{ -} - -void -BinaryOperator::Configure(Tensor* input0, Tensor* input1, Tensor* output) -{ - assert(input0 != nullptr); - assert(input1 != nullptr); - assert(output != nullptr); - assert(input0->type == input1->type); - assert(input0->type == output->type); - // inplace can only be set to true in restricted op type (refer to FlexFlow) - assert( - !inplace || - ((input0 == output) && ((op_type == OperatorType::OP_EW_ADD) || - (op_type == OperatorType::OP_EW_MUL)))); - // Make sure that they have the same bounds. - // Broadcasting is currently not supported - assert(input0->bounds.size() == input1->bounds.size()); - assert(input0->bounds.size() == output->bounds.size()); - for (unsigned idx = 0; idx < input0->bounds.size(); idx++) { - assert(input0->bounds[idx] == input1->bounds[idx]); - assert(input0->bounds[idx] == output->bounds[idx]); - } - inputs.push_back(input0); - inputs.push_back(input1); - outputs.push_back(output); -} - -Domain -BinaryOperator::GetBounds(Processor proc) -{ - const size_t dims = outputs[0]->bounds.size(); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - return strategy->find_local_domain(proc, global); -} - -void -BinaryOperator::Load(Realm::Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - BinaryArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.op_type = op_type; - proc_args.bounds = GetBounds(proc); - proc_args.datatype = outputs[0]->type; - proc_args.inplace = inplace; -#ifdef LEGION_USE_CUDA - if (proc.kind() == Processor::TOC_PROC) { - if (use_cudnn(op_type, proc_args.datatype)) { - proc_args.cudnn = model->runtime_->cudnn[local_index]; - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.input0Tensor)); - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.input1Tensor)); - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.outputTensor)); - CHECK_CUDNN(cudnnCreateOpTensorDescriptor(&proc_args.opDesc)); - cudnnOpTensorOp_t mode; - switch (op_type) { - case OperatorType::OP_EW_ADD: - case OperatorType::OP_EW_SUB: { - mode = CUDNN_OP_TENSOR_ADD; - break; - } - case OperatorType::OP_EW_MUL: { - mode = CUDNN_OP_TENSOR_MUL; - break; - } - default: - abort(); - } - cudnnDataType_t type = to_op_tensor_comp_type( - proc_args.datatype, proc_args.datatype, proc_args.datatype); - CHECK_CUDNN(cudnnSetOpTensorDescriptor( - proc_args.opDesc, mode, type, CUDNN_PROPAGATE_NAN)); - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.input0Tensor, proc_args.bounds, inputs[0]->type)); - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.input1Tensor, proc_args.bounds, inputs[1]->type)); - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.outputTensor, proc_args.bounds, outputs[0]->type)); - } - } -#endif -} - -void -BinaryOperator::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID 
mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the pass-by-value arguments - std::map values; - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); - values[itr.p] = TaskArgument(args + local_index, sizeof(BinaryArgs)); - } - argmaps[instance_index] = runtime->construct_future_map( - ctx, launch_space, values, true /*collective*/, shardfn->sharding_id); - - IndexTaskLauncher& launcher = launchers[instance_index]; - launcher = IndexTaskLauncher( - BINARY_TASK_ID, launch_space, TaskArgument(NULL, 0), - ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED, - false /*must*/, mapper, strategy->tag); - LogicalRegion input0_region = inputs[0]->region[instance_index]; - LogicalRegion input1_region = inputs[1]->region[instance_index]; - LogicalPartition input0_part = - instance->find_or_create_tiled_partition(inputs[0], strategy); - LogicalPartition input1_part = - instance->find_or_create_tiled_partition(inputs[1], strategy); - if (inplace) { - launcher.add_region_requirement(RegionRequirement( - input0_part, 0 /*projection id*/, LEGION_READ_WRITE, LEGION_EXCLUSIVE, - input0_region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - input1_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input1_region)); - launcher.add_field(1, FID_DATA); - } else { - // Create a logical region for the output data - assert(outputs.size() == 1); - LogicalRegion output_region = instance->create_tensor_region(outputs[0]); - // Create partitions for the regions - LogicalPartition output_part = - instance->find_or_create_tiled_partition(outputs[0], strategy); - - launcher.add_region_requirement(RegionRequirement( - output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, - LEGION_EXCLUSIVE, output_region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - input0_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input0_region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - input1_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input1_region)); - launcher.add_field(2, FID_DATA); - } -} - -void -BinaryOperator::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - runtime->execute_index_space(ctx, launchers[instance_index]); -} - -void -BinaryOperator::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - argmaps[instance_index] = FutureMap(); -} - -void -BinaryOperator::Free(Realm::Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; -#ifdef LEGION_USE_CUDA - if ((proc.kind() == Processor::TOC_PROC) && - use_cudnn(op_type, outputs[0]->type)) { - const unsigned local_index = strategy->find_local_offset(proc); - BinaryArgs& proc_args 
= args[local_index]; - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.input0Tensor)); - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.input1Tensor)); - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.outputTensor)); - CHECK_CUDNN(cudnnDestroyOpTensorDescriptor(proc_args.opDesc)); - } -#endif -} - -/*static*/ void -BinaryOperator::PreregisterTaskVariants(void) -{ - { - TaskVariantRegistrar cpu_registrar(BINARY_TASK_ID, "Binary CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - cpu_registrar, "Binary Operator"); - } -#ifdef LEGION_USE_CUDA - { - TaskVariantRegistrar gpu_registrar(BINARY_TASK_ID, "Binary GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - gpu_registrar, "Binary Operator"); - } -#endif -} - -/*static*/ void -BinaryOperator::forward_cpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - // TODO: implement this - assert(false); -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -BinaryOperator::forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime) -{ - assert(task->local_arglen == sizeof(BinaryArgs)); - const BinaryArgs* args = (const BinaryArgs*)task->local_args; -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - if (use_cudnn(args->op_type, args->datatype)) { - CHECK_CUDNN(cudnnSetStream(args->cudnn, stream)); - } -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - if (args->inplace) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - void* inout_ptr = nullptr; - const void* input1_ptr = nullptr; - size_t volume = 0; - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->bounds; \ - volume = bounds.volume(); \ - inout_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - input1_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[1]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - forward_kernel(args, stream, inout_ptr, input1_ptr, inout_ptr, volume); - } else { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - const void* input0_ptr = nullptr; - const void* input1_ptr = nullptr; - void* output_ptr = nullptr; - size_t volume = 0; - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->bounds; \ - volume = bounds.volume(); \ - output_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - input0_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[1]); \ - input1_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[2]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - forward_kernel(args, stream, input0_ptr, input1_ptr, output_ptr, volume); - } - if (args->profiling) { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_end)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - CHECK_CUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - 
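One real bug worth flagging in the profiling epilogue just above: in the stream-ordered #else branch, t_start is recorded a second time where t_end was clearly intended, so the following cudaEventSynchronize(t_end) waits on an event that is never recorded. The same copy-pasted epilogue recurs in the Concat and Conv2D operators below. A minimal corrected sketch of the intended stream-ordered measurement (CHECK_CUDA and the live stream are assumed from the surrounding code):

    // Corrected stream-ordered timing sketch; the fix is recording t_end,
    // not t_start, after the work has been enqueued on the stream.
    ::cudaEvent_t t_start, t_end;
    CHECK_CUDA(cudaEventCreate(&t_start));
    CHECK_CUDA(cudaEventCreate(&t_end));
    CHECK_CUDA(cudaEventRecord(t_start, stream));
    forward_kernel(args, stream, input0_ptr, input1_ptr, output_ptr, volume);
    CHECK_CUDA(cudaEventRecord(t_end, stream));  // original recorded t_start here
    CHECK_CUDA(cudaEventSynchronize(t_end));
    float elapsed_ms = 0.f;
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, t_start, t_end));
    CHECK_CUDA(cudaEventDestroy(t_start));
    CHECK_CUDA(cudaEventDestroy(t_end));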
CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - CHECK_CUDA(cudaEventDestroy(t_start)); - CHECK_CUDA(cudaEventDestroy(t_end)); - printf( - "%s [Binary] forward time (CF) = %.2fms\n", - args->owner->op_name.c_str(), elapsed); - } -} - -/*static*/ bool -BinaryOperator::use_cudnn(OperatorType optype, DataType dtype) -{ - if (to_op_tensor_comp_type(dtype, dtype, dtype) != CUDNN_DATA_UINT8) { - switch (optype) { - case OperatorType::OP_EW_ADD: - case OperatorType::OP_EW_SUB: - case OperatorType::OP_EW_MUL: - return true; - default: - return false; - } - } - return false; -} - -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/binary.cu b/triton/src/operators/binary.cu deleted file mode 100644 index 7410d098f1..0000000000 --- a/triton/src/operators/binary.cu +++ /dev/null @@ -1,257 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "binary.h" - -#include "mathtypes/half.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -__global__ static void -binary_forward_half( - const __half* input0, const __half* input1, __half* output, - const __half alpha, const __half beta, const OperatorType optype, - const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - switch (optype) { - case OP_EW_ADD: { - output[offset] = - alpha * (input0[offset] + input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_SUB: { - output[offset] = - alpha * (input0[offset] - input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_MUL: { - output[offset] = - alpha * (input0[offset] * input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_DIV: { - output[offset] = - alpha * (input0[offset] / input1[offset]) + beta * output[offset]; - break; - } - default: - break; - } -} - -__global__ static void -binary_forward_float( - const float* input0, const float* input1, float* output, const float alpha, - const float beta, const OperatorType optype, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - switch (optype) { - case OP_EW_ADD: { - output[offset] = - alpha * (input0[offset] + input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_SUB: { - output[offset] = - alpha * (input0[offset] - input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_MUL: { - output[offset] = - alpha * (input0[offset] * input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_DIV: { - output[offset] = - alpha * (input0[offset] / input1[offset]) + beta * output[offset]; - break; - } - default: - break; - } -} - -__global__ static void -binary_forward_double( - const double* input0, const double* input1, double* output, - const double alpha, const double beta, const OperatorType optype, - const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset 
>= volume) - return; - switch (optype) { - case OP_EW_ADD: { - output[offset] = - alpha * (input0[offset] + input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_SUB: { - output[offset] = - alpha * (input0[offset] - input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_MUL: { - output[offset] = - alpha * (input0[offset] * input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_DIV: { - output[offset] = - alpha * (input0[offset] / input1[offset]) + beta * output[offset]; - break; - } - default: - break; - } -} - -__global__ static void -binary_forward_int8( - const int8_t* input0, const int8_t* input1, int8_t* output, - const int8_t alpha, const int8_t beta, const OperatorType optype, - const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - switch (optype) { - case OP_EW_ADD: { - output[offset] = - alpha * (input0[offset] + input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_SUB: { - output[offset] = - alpha * (input0[offset] - input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_MUL: { - output[offset] = - alpha * (input0[offset] * input1[offset]) + beta * output[offset]; - break; - } - case OP_EW_DIV: { - output[offset] = - alpha * (input0[offset] / input1[offset]) + beta * output[offset]; - break; - } - default: - break; - } -} - -__host__ - /*static*/ void - BinaryOperator::forward_kernel( - const BinaryArgs* args, ::cudaStream_t stream, const void* input0_ptr, - const void* input1_ptr, void* output_ptr, size_t num_elements) -{ - if (use_cudnn(args->op_type, args->datatype)) { - switch (args->datatype) { - case DataType::DT_DOUBLE: { - double alpha0 = 1.0, - alpha1 = (args->op_type == OperatorType::OP_EW_SUB) ? -1.0 : 1.0, - beta = 0.0; - CHECK_CUDNN(cudnnOpTensor( - args->cudnn, args->opDesc, &alpha0, args->input0Tensor, input0_ptr, - &alpha1, args->input1Tensor, input1_ptr, &beta, args->outputTensor, - output_ptr)); - break; - } - case DataType::DT_FLOAT: { - float alpha0 = 1.f, - alpha1 = (args->op_type == OperatorType::OP_EW_SUB) ? -1.f : 1.f, - beta = 0.f; - CHECK_CUDNN(cudnnOpTensor( - args->cudnn, args->opDesc, &alpha0, args->input0Tensor, input0_ptr, - &alpha1, args->input1Tensor, input1_ptr, &beta, args->outputTensor, - output_ptr)); - break; - } - case DataType::DT_INT8: { - int8_t alpha0 = 1, - alpha1 = (args->op_type == OperatorType::OP_EW_SUB) ? -1 : 1, - beta = 0; - CHECK_CUDNN(cudnnOpTensor( - args->cudnn, args->opDesc, &alpha0, args->input0Tensor, input0_ptr, - &alpha1, args->input1Tensor, input1_ptr, &beta, args->outputTensor, - output_ptr)); - break; - } - case DataType::DT_HALF: { - __half alpha0 = 1.f, - alpha1 = (args->op_type == OperatorType::OP_EW_SUB) ? 
-1.f : 1.f, - beta = 0.f; - CHECK_CUDNN(cudnnOpTensor( - args->cudnn, args->opDesc, &alpha0, args->input0Tensor, input0_ptr, - &alpha1, args->input1Tensor, input1_ptr, &beta, args->outputTensor, - output_ptr)); - break; - } - default: - abort(); - break; - } - } else { - const size_t blocks = - (num_elements + (THREADS_PER_BLOCK - 1)) / THREADS_PER_BLOCK; - assert( - (args->op_type == OP_EW_ADD) || (args->op_type == OP_EW_SUB) || - (args->op_type == OP_EW_MUL) || (args->op_type == OP_EW_DIV)); - switch (args->datatype) { - case DataType::DT_DOUBLE: { - double alpha = 1.0, beta = 0.0; - binary_forward_double<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const double*)input0_ptr, (const double*)input1_ptr, - (double*)output_ptr, alpha, beta, args->op_type, num_elements); - break; - } - case DataType::DT_FLOAT: { - float alpha = 1.f, beta = 0.f; - binary_forward_float<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const float*)input0_ptr, (const float*)input1_ptr, - (float*)output_ptr, alpha, beta, args->op_type, num_elements); - break; - } - case DataType::DT_INT8: { - int8_t alpha = 1, beta = 0; - binary_forward_int8<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const int8_t*)input0_ptr, (const int8_t*)input1_ptr, - (int8_t*)output_ptr, alpha, beta, args->op_type, num_elements); - break; - } - case DataType::DT_HALF: { - __half alpha = 1.f, beta = 0.f; - binary_forward_half<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const __half*)input0_ptr, (const __half*)input1_ptr, - (__half*)output_ptr, alpha, beta, args->op_type, num_elements); - break; - } - default: - abort(); - break; - } - } -} - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/binary.h b/triton/src/operators/binary.h deleted file mode 100644 index 1c889bd7a5..0000000000 --- a/triton/src/operators/binary.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
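The four kernels in the deleted binary.cu differ only in their element type; each maps one thread to one element, guards the tail block with a bounds check, and the host side sizes the grid with a ceiling division. A sketch of the same pattern collapsed into a single template (the template itself is an assumption; the original deliberately instantiates one kernel per type):

    // One-thread-per-element kernel; T is double/float/int8_t/__half.
    template <typename T>
    __global__ static void binary_forward(
        const T* in0, const T* in1, T* out, const T alpha, const T beta,
        const OperatorType optype, const size_t volume)
    {
      const size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
      if (offset >= volume)
        return;  // threads past the end of the tail block do nothing
      T value;
      switch (optype) {
        case OP_EW_ADD: value = in0[offset] + in1[offset]; break;
        case OP_EW_SUB: value = in0[offset] - in1[offset]; break;
        case OP_EW_MUL: value = in0[offset] * in1[offset]; break;
        case OP_EW_DIV: value = in0[offset] / in1[offset]; break;
        default: return;
      }
      out[offset] = alpha * value + beta * out[offset];
    }

    // Host side: round the element count up to whole blocks
    // (in0/in1/out/volume/stream as in forward_kernel above).
    const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    binary_forward<float><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
        in0, in1, out, 1.f, 0.f, OP_EW_ADD, volume);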
- */ - -#ifndef __LEGION_TRITON_BINARY_H__ -#define __LEGION_TRITON_BINARY_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -struct BinaryArgs : public OperatorArgs { - public: - BinaryArgs() = default; -#ifdef LEGION_USE_CUDA - cudnnHandle_t cudnn; - cudnnTensorDescriptor_t input0Tensor, input1Tensor, outputTensor; - cudnnOpTensorDescriptor_t opDesc; -#endif - OperatorType op_type; - Legion::Domain bounds; - DataType datatype; - bool inplace; -}; - -class BinaryOperator : public Operator { - public: - BinaryOperator( - LegionModelState* model, const LayerStrategy* strategy, OperatorType type, - bool inplace, const char* name); - virtual ~BinaryOperator() = default; - - void Configure(Tensor* input0, Tensor* input1, Tensor* output); - Legion::Domain GetBounds(Realm::Processor proc); - - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - static void PreregisterTaskVariants(void); - static void forward_cpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#ifdef LEGION_USE_CUDA - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - - protected: - static bool use_cudnn(OperatorType optype, DataType dtype); - static void forward_kernel( - const BinaryArgs* args, ::cudaStream_t stream, const void* input0_ptr, - const void* input1_ptr, void* output_ptr, size_t num_elements); -#endif - public: - const bool inplace; - - protected: - BinaryArgs args[MAX_LOCAL_PROCS]; - Legion::FutureMap argmaps[MAX_NUM_INSTANCES]; - Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES]; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_BINARY_H__ diff --git a/triton/src/operators/concat.cc b/triton/src/operators/concat.cc deleted file mode 100644 index 9f2ad1027e..0000000000 --- a/triton/src/operators/concat.cc +++ /dev/null @@ -1,456 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
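BinaryOperator::initialize above, and Concat and Conv2D below, all share one argument-plumbing pattern: each shard serializes its precomputed per-processor args struct into a collective FutureMap keyed by launch-domain point, then caches one IndexTaskLauncher per model instance so that forward() is a pure replay. A condensed sketch of that shared pattern, reusing the member names from the deleted sources (`args`, `argmaps`, `launchers`; OP_TASK_ID stands in for the per-operator task id):

    // Condensed from the deleted initialize() bodies.
    const Domain launch_domain = strategy->get_launch_domain();
    IndexSpace launch_space =
        instance->find_or_create_index_space(launch_domain);
    ShardingFunction* shardfn = strategy->sharding_function;
    std::map<DomainPoint, TaskArgument> values;
    for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) {
      const Processor proc = shardfn->find_proc(itr.p, launch_domain);
      if (!strategy->is_local_processor(proc))
        continue;  // some other shard owns this point
      const unsigned local = strategy->find_local_offset(proc);
      values[itr.p] = TaskArgument(args + local, sizeof(args[local]));
    }
    argmaps[instance_index] = runtime->construct_future_map(
        ctx, launch_space, values, true /*collective*/, shardfn->sharding_id);
    launchers[instance_index] = IndexTaskLauncher(
        OP_TASK_ID, launch_space, TaskArgument(NULL, 0),
        ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED,
        false /*must*/, mapper, strategy->tag);
    // forward() then simply replays the cached launcher:
    runtime->execute_index_space(ctx, launchers[instance_index]);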
- */ - -#include "concat.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Legion::ProjectionID Concat::filter_functor_id; - -LogicalRegion -FilterProjectionFunctor::project( - LogicalPartition upper_bound, const DomainPoint& point, - const Domain& domain) -{ - // Check to see if the point is in the color space - const Domain limits = runtime->get_index_partition_color_space( - upper_bound.get_index_partition()); - if (limits.contains(point)) - return runtime->get_logical_subregion_by_color(upper_bound, point); - else - return LogicalRegion::NO_REGION; -} - -ConcatArgs::ConcatArgs(void) : local_index(0), datatype(DT_NONE), axis(-1) {} - -Concat::Concat( - LegionModelState* model, const LayerStrategy* strategy, size_t inputs, - int ax, const char* name) - : Operator(model, strategy, OperatorType::OP_CONCAT, name, inputs, 0, 1), - axis(ax) -{ - assert(inputs > 0); -} - -void -Concat::Configure(const std::vector& ins, Tensor* out) -{ - assert(num_inputs == ins.size()); - inputs = ins; - size_t axis_size = 0; - const size_t dims = out->bounds.size(); - assert(dims == strategy->nDims); - for (unsigned idx = 0; idx < inputs.size(); idx++) { - assert(inputs[idx]->type == out->type); - assert(inputs[idx]->bounds.size() == dims); - for (unsigned d = 0; d < dims; d++) { - if (d == axis) - axis_size += inputs[idx]->bounds[d]; - else - assert(inputs[idx]->bounds[d] == out->bounds[d]); - } - } - assert(axis_size == out->bounds[axis]); - outputs.push_back(out); - // Figure out the output tiling domain - std::vector tile_sizes(dims); - for (unsigned d = 0; d < dims; d++) - tile_sizes[d] = (out->bounds[d] + strategy->dim[d] - 1) / strategy->dim[d]; - coord_t offset = 0; - // Now compute the domains and transforms needed for constructing - // the partitions for each of the inputs - input_color_spaces.resize(num_inputs); - input_extents.resize(num_inputs); - for (unsigned idx = 0; idx < num_inputs; idx++) { - DomainPoint lo, hi, color_lo, color_hi; - lo.dim = dims; - hi.dim = dims; - color_lo.dim = dims; - color_hi.dim = dims; - for (int d = 0; d < dims; d++) { - if (d == axis) { - const coord_t extent = inputs[idx]->bounds[d]; - lo[d] = -offset; - hi[d] = (tile_sizes[d] - 1 /*inclusive*/) - offset; - color_lo[d] = offset / tile_sizes[d]; - color_hi[d] = (offset + extent - 1) / tile_sizes[d]; - offset += extent; - } else { - lo[d] = 0; - hi[d] = tile_sizes[d] - 1; // make it inclusive - color_lo[d] = 0; - color_hi[d] = strategy->dim[d] - 1; // make it inclusive - } - } - input_color_spaces[idx] = Domain(color_lo, color_hi); - input_extents[idx] = Domain(lo, hi); - } - // The input transform is the same across all the inputs - switch (dims) { -#define DIMFUNC(N) \ - case N: { \ - Transform transform; \ - for (int i = 0; i < N; i++) \ - for (int j = 0; j < N; j++) \ - if (i == j) \ - transform[i][j] = tile_sizes[i]; \ - else \ - transform[i][j] = 0; \ - input_transform = transform; \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } -} - -Domain -Concat::GetBounds(Processor proc) -{ - const size_t dims = outputs[0]->bounds.size(); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - return strategy->find_local_domain(proc, global); -} - -void -Concat::Load(Processor proc) -{ - assert(proc.kind() == strategy->kind); - assert(inputs[0]->bounds.size() == size_t(strategy->nDims)); - // If this processor is not 
used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - ConcatArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.local_index = local_index; - proc_args.bounds = GetBounds(proc); - proc_args.datatype = inputs[0]->type; - proc_args.axis = axis; -} - -void -Concat::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the pass-by-value arguments - std::map values; - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); - values[itr.p] = TaskArgument(args + local_index, sizeof(ConcatArgs)); - } - argmaps[instance_index] = runtime->construct_future_map( - ctx, launch_space, values, true /*collective*/, shardfn->sharding_id); - - IndexTaskLauncher& launcher = launchers[instance_index]; - launcher = IndexTaskLauncher( - CONCAT_TASK_ID, launch_space, TaskArgument(NULL, 0), - ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED, - false /*must*/, mapper, strategy->tag); - LogicalRegion output_region = instance->create_tensor_region(outputs[0]); - LogicalPartition output_part = - instance->find_or_create_tiled_partition(outputs[0], strategy); - launcher.add_region_requirement(RegionRequirement( - output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, LEGION_EXCLUSIVE, - output_region)); - launcher.add_field(0, FID_DATA); - assert(inputs.size() == input_color_spaces.size()); - assert(inputs.size() == input_extents.size()); - // Now go through and create partitions for each of the input regions - for (unsigned idx = 0; idx < inputs.size(); idx++) { - IndexSpace input_color_space = - instance->find_or_create_index_space(input_color_spaces[idx]); - LogicalRegion input_region = inputs[idx]->region[instance_index]; - IndexPartition index_part = instance->find_or_create_partition( - input_region.get_index_space(), input_color_space, input_transform, - input_extents[idx], LEGION_DISJOINT_COMPLETE_KIND); - LogicalPartition input_part = runtime->get_logical_partition_by_tree( - ctx, index_part, input_region.get_field_space(), - input_region.get_tree_id()); - launcher.add_region_requirement(RegionRequirement( - input_part, filter_functor_id, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input_region)); - launcher.add_field(idx + 1 /*include output*/, FID_DATA); - } -} - -void -Concat::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - runtime->execute_index_space(ctx, launchers[instance_index]); -} - -void -Concat::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - argmaps[instance_index] = FutureMap(); -} - -void -Concat::Free(Processor proc) -{ - assert(proc.kind() == strategy->kind); -} - -/*static*/ void -Concat::PreregisterTaskVariants(void) -{ - { - // Register our special projection functor with the runtime - filter_functor_id 
= Runtime::generate_static_projection_id(); - Runtime::preregister_projection_functor( - filter_functor_id, new FilterProjectionFunctor); - } - { - TaskVariantRegistrar cpu_registrar(CONCAT_TASK_ID, "Concat CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - cpu_registrar, "Concat Operator"); - } -#ifdef LEGION_USE_CUDA - { - TaskVariantRegistrar gpu_registrar(CONCAT_TASK_ID, "Concat GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - gpu_registrar, "Concat Operator"); - } -#endif -} - -/*static*/ void -Concat::forward_cpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(ConcatArgs)); - const ConcatArgs* args = (const ConcatArgs*)task->local_args; - assert(regions.size() >= 2); - assert(regions.size() == task->regions.size()); - uint8_t* output_ptr = nullptr; - size_t total_elements = 1; - size_t element_stride = sizeof_datatype(args->datatype); - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->bounds; \ - output_ptr = (uint8_t*)TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - for (int d = DIM - 1; d >= 0; d--) { \ - element_stride *= ((bounds.hi[d] - bounds.lo[d]) + 1); \ - if (d == args->axis) \ - break; \ - } \ - for (int d = 0; d < args->axis; d++) \ - total_elements *= ((bounds.hi[d] - bounds.lo[d]) + 1); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - for (unsigned idx = 1; idx < regions.size(); idx++) { - // Skip any regions which have been masked off for this point task - LogicalRegion region = task->regions[idx].region; - if (!region.exists()) - continue; - const Domain input_domain = - runtime->get_index_space_domain(region.get_index_space()); - assert(input_domain.get_dim() == args->bounds.get_dim()); - const uint8_t* input_ptr = nullptr; - size_t element_size = sizeof_datatype(args->datatype); - switch (input_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = input_domain; \ - assert(!bounds.empty()); \ - input_ptr = (const uint8_t*)TensorAccessor::access( \ - args->datatype, bounds, regions[idx]); \ - for (int d = DIM - 1; d >= 0; d--) { \ - element_size *= ((bounds.hi[d] - bounds.lo[d]) + 1); \ - if (d == args->axis) \ - break; \ - } \ - for (int d = 0; d < args->axis; d++) { \ - assert(bounds.lo[d] == args->bounds.lo()[d]); \ - assert(bounds.hi[d] == args->bounds.hi()[d]); \ - } \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - uint8_t* current_ptr = output_ptr; - for (size_t element = 0; element < total_elements; element++) { - memcpy(current_ptr, input_ptr, element_size); - input_ptr += element_size; - current_ptr += element_stride; - } - // Update the output ptr with the new offset for the next set of elements - output_ptr += element_size; - } -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Concat::forward_gpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(ConcatArgs)); - const ConcatArgs* args = (const ConcatArgs*)task->local_args; -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - 
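The concatenation copy in forward_cpu above reduces to strided-2D semantics: for each of total_elements outer rows, copy element_size contiguous bytes of one input into an output row that is element_stride bytes wide, then advance the output base by element_size so the next input lands beside it. The GPU path below expresses exactly that with cudaMemcpy2DAsync; a minimal standalone sketch of the equivalence (function and parameter names are illustrative):

    #include <cstdint>
    #include <cstring>  // memcpy

    // CPU reference, as in the deleted forward_cpu loop.
    void concat_copy_cpu(uint8_t* out, const uint8_t* in, size_t element_size,
                         size_t element_stride, size_t total_elements) {
      for (size_t e = 0; e < total_elements; e++) {
        memcpy(out, in, element_size);  // one contiguous run of this input
        in += element_size;
        out += element_stride;          // skip past the other inputs' bytes
      }
    }

    // GPU equivalent: destination pitch = element_stride, source tightly
    // packed (pitch = width = element_size), one row per outer element.
    CHECK_CUDA(cudaMemcpy2DAsync(
        out, element_stride, in, element_size, element_size, total_elements,
        cudaMemcpyDeviceToDevice, stream));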
CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - assert(regions.size() >= 2); - assert(regions.size() == task->regions.size()); - uint8_t* output_ptr = nullptr; - size_t total_elements = 1; - size_t element_stride = sizeof_datatype(args->datatype); - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->bounds; \ - output_ptr = (uint8_t*)TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - for (int d = DIM - 1; d >= 0; d--) { \ - element_stride *= ((bounds.hi[d] - bounds.lo[d]) + 1); \ - if (d == args->axis) \ - break; \ - } \ - for (int d = 0; d < args->axis; d++) \ - total_elements *= ((bounds.hi[d] - bounds.lo[d]) + 1); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - for (unsigned idx = 1; idx < regions.size(); idx++) { - // Skip any regions which have been masked off for this point task - LogicalRegion region = task->regions[idx].region; - if (!region.exists()) - continue; - const Domain input_domain = - runtime->get_index_space_domain(region.get_index_space()); - assert(input_domain.get_dim() == args->bounds.get_dim()); - const uint8_t* input_ptr = nullptr; - size_t element_size = sizeof_datatype(args->datatype); - switch (input_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = input_domain; \ - assert(!bounds.empty()); \ - input_ptr = (const uint8_t*)TensorAccessor::access( \ - args->datatype, bounds, regions[idx]); \ - for (int d = DIM - 1; d >= 0; d--) { \ - element_size *= ((bounds.hi[d] - bounds.lo[d]) + 1); \ - if (d == args->axis) \ - break; \ - } \ - for (int d = 0; d < args->axis; d++) { \ - assert(bounds.lo[d] == args->bounds.lo()[d]); \ - assert(bounds.hi[d] == args->bounds.hi()[d]); \ - } \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - if (total_elements == 1) { - assert(element_stride == element_size); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaMemcpyAsync( - output_ptr, input_ptr, element_size, cudaMemcpyDeviceToDevice)); -#else - CHECK_CUDA(cudaMemcpyAsync( - output_ptr, input_ptr, element_size, cudaMemcpyDeviceToDevice, - stream)); -#endif - } else { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaMemcpy2DAsync( - output_ptr, element_stride, input_ptr, element_size, element_size, - total_elements, cudaMemcpyDeviceToDevice)); -#else - CHECK_CUDA(cudaMemcpy2DAsync( - output_ptr, element_stride, input_ptr, element_size, element_size, - total_elements, cudaMemcpyDeviceToDevice, stream)); -#endif - } - // Update the output ptr with the new offset for the next set of elements - output_ptr += element_size; - } - if (args->profiling) { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_end)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - CHECK_CUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - CHECK_CUDA(cudaEventDestroy(t_start)); - CHECK_CUDA(cudaEventDestroy(t_end)); - printf( - "%s [Concat] forward time (CF) = %.2fms\n", - args->owner->op_name.c_str(), elapsed); - } -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/concat.h b/triton/src/operators/concat.h deleted file mode 100644 index ebcbedf09b..0000000000 --- a/triton/src/operators/concat.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * 
- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_CONCAT_H__ -#define __LEGION_TRITON_CONCAT_H__ - -#include "operator.h" -#include "tensor.h" -#ifdef LEGION_USE_CUDA -#include "cudahelp.h" -#endif - -namespace triton { namespace backend { namespace legion { - -class FilterProjectionFunctor : public Legion::ProjectionFunctor { - public: - virtual Legion::LogicalRegion project( - Legion::LogicalPartition upper_bound, const Legion::DomainPoint& point, - const Legion::Domain& domain) override; - - public: - virtual bool is_functional(void) const override { return true; } - virtual unsigned get_depth(void) const override { return 0; } -}; - -struct ConcatArgs : public OperatorArgs { - public: - ConcatArgs(void); - - public: - unsigned local_index; - Legion::Domain bounds; - DataType datatype; - int axis; -}; - -class Concat : public Operator { - public: - Concat( - LegionModelState* model, const LayerStrategy* strategy, size_t inputs, - int axis, const char* name); - - public: - void Configure(const std::vector& inputs, Tensor* output); - Legion::Domain GetBounds(Realm::Processor proc); - - public: - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - public: - static void PreregisterTaskVariants(void); - static void forward_cpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#ifdef LEGION_USE_CUDA - public: - // Forward task for the GPU - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#endif - public: - const int axis; - static Legion::ProjectionID filter_functor_id; - - protected: - ConcatArgs args[MAX_LOCAL_PROCS]; - Legion::FutureMap argmaps[MAX_NUM_INSTANCES]; - Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES]; - std::vector input_color_spaces; - std::vector input_extents; - Legion::DomainTransform input_transform; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_CONCAT_H__ diff --git a/triton/src/operators/conv2d.cc b/triton/src/operators/conv2d.cc deleted file mode 100644 index 01d7077146..0000000000 --- a/triton/src/operators/conv2d.cc +++ /dev/null @@ -1,594 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
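Concat's custom projection functor above is how partially overlapping inputs get masked off: each input is partitioned over its own, smaller color space, and for launch points whose color falls outside it the functor yields NO_REGION, which the point task then skips via region.exists(). A minimal sketch of the registration and lookup pattern, mirroring the deleted Concat code (local names like filter_id are illustrative):

    // Preregistration, as in Concat::PreregisterTaskVariants.
    Legion::ProjectionID filter_id =
        Legion::Runtime::generate_static_projection_id();
    Legion::Runtime::preregister_projection_functor(
        filter_id, new FilterProjectionFunctor);

    // Each input region requirement routes through the functor.
    launcher.add_region_requirement(Legion::RegionRequirement(
        input_part, filter_id, LEGION_READ_ONLY, LEGION_EXCLUSIVE,
        input_region));

    // Inside the point task, masked-off inputs show up as NO_REGION.
    for (unsigned idx = 1; idx < regions.size(); idx++) {
      if (!task->regions[idx].region.exists())
        continue;  // this input contributes nothing to this tile
      /* ... copy this input's slice as above ... */
    }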
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "conv2d.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Conv2D::Conv2D( - LegionModelState* model, const LayerStrategy* strategy, size_t inChannels, - size_t outChannels, size_t kernelH, size_t kernelW, size_t strideH, - size_t strideW, size_t paddingH, size_t paddingW, ActivationMode act, - size_t gps, bool bias, const char* name) - : Operator( - model, strategy, OP_CONV2D, name, 1 /*inputs*/, - bias ? 2 : 1 /*weights*/, 1 /*outputs*/), - activation(act), in_channels(inChannels), out_channels(outChannels), - kernel_h(kernelH), kernel_w(kernelW), stride_h(strideH), - stride_w(strideW), padding_h(paddingH), padding_w(paddingW), groups(gps), - use_bias(bias) -{ - assert(strategy->nDims == 4); - // We don't support partitioning over the channel dimension right now - assert(strategy->dim[1] == 1); -} - -Conv2D::~Conv2D(void) {} - -void -Conv2D::Configure(Tensor* input, Weights* wts, Tensor* output, Weights* bias) -{ - assert(input != nullptr); - assert(in_channels == input->bounds[1]); - assert(wts != nullptr); - assert(output != nullptr); - if (use_bias) - assert(bias != nullptr); - else - assert(bias == nullptr); - inputs.push_back(input); - outputs.push_back(output); - weights.push_back(wts); - if (use_bias) - weights.push_back(bias); - // Compute the input transform and extent based on the bounds and our strategy - Point<4> tiles; - // Compute the default chunk sizes along each dimension - for (int i = 0; i < 4; i++) - tiles[i] = (input->bounds[i] + strategy->dim[i] - 1) / strategy->dim[i]; - Transform<4, 4> transform; - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) - if (i == j) - transform[i][j] = tiles[i]; - else - transform[i][j] = 0; - input_transform = transform; - Rect<4> extent; - for (int i = 0; i < 4; i++) { - if (i < 2) { - extent.lo[i] = 0; - extent.hi[i] = tiles[i] - 1; - } else { - // Compute the ghost boundaries on the height/width dimensions - coord_t ghost_cells = ((i < 2) ? kernel_w : kernel_h) / 2; - extent.lo[i] = -ghost_cells; - extent.hi[i] = tiles[i] + ghost_cells - 1; - } - } - input_extent = extent; -} - -Rect<4> -Conv2D::GetInputBounds(Processor proc) -{ - const Point<4> point = strategy->find_local_point(proc); - const Transform<4, 4> transform = input_transform; - const Point<4> offset = transform * point; - const Rect<4> extent = input_extent; - const Rect<4> result(extent.lo + offset, extent.hi + offset); - Rect<4> bounds; - for (int d = 0; d < 4; d++) { - bounds.lo[d] = 0; - bounds.hi[d] = inputs[0]->bounds[d] - 1; - } - return result.intersection(bounds); -} - -Rect<4> -Conv2D::GetOutputBounds(Processor proc) -{ - DomainPoint lo, hi; - lo.dim = 4; - hi.dim = 4; - for (int d = 0; d < 4; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - const Rect<4> result = strategy->find_local_domain(proc, global); - return result; -} - -Rect<4> -Conv2D::GetWeightBounds(Realm::Processor proc) -{ - // Bounds for weight is similar to output, except the first dimension - // wouldn't be partitioned. 
Note that weight is in shape - // (out_channel, in_channel / groups, kernel_h, kernel_w) - DomainPoint lo, hi; - lo.dim = 4; - hi.dim = 4; - for (int d = 0; d < 4; d++) { - lo[d] = 0; - hi[d] = weights[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - Rect<4> result = strategy->find_local_domain(proc, global); - result.lo[0] = 0; - result.hi[0] = weights[0]->bounds[0] - 1; - return result; -} - -Rect<1> -Conv2D::GetBiasBounds(Realm::Processor proc) -{ - // Always return the whole bias bound - DomainPoint lo, hi; - lo.dim = 1; - hi.dim = 1; - lo[0] = 0; - hi[0] = weights[1]->bounds[0] - 1; - return Rect<1>(lo, hi); -} - -void -Conv2D::Load(Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - Conv2DArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.local_index = local_index; - proc_args.relu = (activation == AC_MODE_RELU); - proc_args.use_bias = use_bias; - const Rect<4> input = GetInputBounds(proc); - const Rect<4> output = GetOutputBounds(proc); - proc_args.input_bounds = input; - proc_args.local_bounds = output; - proc_args.input_datatype = inputs[0]->type; - proc_args.output_datatype = outputs[0]->type; - proc_args.filter_datatype = weights[0]->type; - if (use_bias) { - proc_args.bias_bounds = Rect<1>(output.lo[1], output.hi[1]); - proc_args.bias_datatype = weights[1]->type; - } -#ifdef LEGION_USE_CUDA - if (proc.kind() == Processor::TOC_PROC) { - proc_args.cudnn = model->runtime_->cudnn[local_index]; - - const coord_t input_n = input.hi[0] - input.lo[0] + 1; - const coord_t input_c = input.hi[1] - input.lo[1] + 1; - const coord_t input_h = input.hi[2] - input.lo[2] + 1; - const coord_t input_w = input.hi[3] - input.lo[3] + 1; - const coord_t output_n = output.hi[0] - output.lo[0] + 1; - const coord_t output_c = output.hi[1] - output.lo[1] + 1; - const coord_t output_h = output.hi[2] - output.lo[2] + 1; - const coord_t output_w = output.hi[3] - output.lo[3] + 1; - - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.inputTensor)); - CHECK_CUDNN(cudnnSetTensor4dDescriptor( - proc_args.inputTensor, CUDNN_TENSOR_NCHW, - to_cudnn_datatype(inputs[0]->type), input_n, input_c, input_h, - input_w)); - - if (use_bias) { - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.biasTensor)); - CHECK_CUDNN(cudnnSetTensor4dDescriptor( - proc_args.biasTensor, CUDNN_TENSOR_NCHW, - to_cudnn_datatype(weights[0]->type), 1, output_c, 1, 1)); - } - - // Require that input_c is divisible by groups - assert(input_c % groups == 0); - CHECK_CUDNN(cudnnCreateFilterDescriptor(&proc_args.filterDesc)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor( - proc_args.filterDesc, to_cudnn_datatype(outputs[0]->type), - CUDNN_TENSOR_NCHW, output_c, input_c / groups, kernel_h, kernel_w)); - - // Technically this will overpad - int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; - int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - if (pad_h != padding_h) - printf("Warning: changing conv_padding_h to satisfy output_h size\n"); - if (pad_w != padding_w) - printf("Warning: changing conv_padding_w to satisfy output_w size\n"); - - CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&proc_args.convDesc)); - CHECK_CUDNN(cudnnSetConvolution2dDescriptor( - proc_args.convDesc, - pad_h, // padding_h, - pad_w, // padding_w, - stride_h, stride_w, 1 /*upscale_x*/, 1 /*upscale_y*/, - 
CUDNN_CROSS_CORRELATION, to_cudnn_datatype(outputs[0]->type))); - if (groups != 1) { - CHECK_CUDNN(cudnnSetConvolutionGroupCount(proc_args.convDesc, groups)); - } - - if (model->runtime_->allowTensorOpMathConversion_) { - CHECK_CUDNN(cudnnSetConvolutionMathType( - proc_args.convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); - } else { - CHECK_CUDNN(cudnnSetConvolutionMathType( - proc_args.convDesc, CUDNN_TENSOR_OP_MATH)); - } - - int n, c, h, w; - CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim( - proc_args.convDesc, proc_args.inputTensor, proc_args.filterDesc, &n, &c, - &h, &w)); - assert(n == output_n); - assert(c == output_c); - assert(h == output_h); - assert(w == output_w); - - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.outputTensor)); - CHECK_CUDNN(cudnnSetTensor4dDescriptor( - proc_args.outputTensor, CUDNN_TENSOR_NCHW, - to_cudnn_datatype(outputs[0]->type), n, c, h, w)); - // select forward algorithm - const int reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionFwdAlgoPerf_t perfResults[reqAlgCnt]; - CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm( - proc_args.cudnn, proc_args.inputTensor, proc_args.filterDesc, - proc_args.convDesc, proc_args.outputTensor, reqAlgCnt, &cnt, - perfResults)); - assert(cnt > 0); - CHECK_CUDNN(perfResults[0].status); - // printf("forwardAlgo(%d) time(%.2lf)\n", perfResults[0].algo, - // perfResults[0].time); - proc_args.fwdAlgo = perfResults[0].algo; - - // figure out how much workspace size we need - CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize( - proc_args.cudnn, proc_args.inputTensor, proc_args.filterDesc, - proc_args.convDesc, proc_args.outputTensor, proc_args.fwdAlgo, - &proc_args.workSpaceSize)); - if (proc_args.workSpaceSize > 0) { - for (unsigned idx = 0; idx < MAX_NUM_INSTANCES; idx++) { - void* workspace = nullptr; - CHECK_CUDA(cudaMalloc(&workspace, proc_args.workSpaceSize)); - workspaces[idx][local_index] = workspace; - } - } - - if (proc_args.relu) { - CHECK_CUDNN(cudnnCreateActivationDescriptor(&proc_args.actiDesc)); - CHECK_CUDNN(cudnnSetActivationDescriptor( - proc_args.actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); - } - - // Copy the filter weights down to the GPU as well - Machine::MemoryQuery query(Machine::get_machine()); - query.only_kind(Memory::GPU_FB_MEM); - query.best_affinity_to(proc); - assert(query.count() > 0); - const Memory local_fb = query.first(); - Weights* wts = weights[0]; - if ((wts->local_memory[local_index].kind() != Memory::GPU_FB_MEM) || - (wts->local_memory[local_index].kind() != Memory::Z_COPY_MEM)) { - void* device_ptr; - const size_t weights_size = sizeof_datatype(wts->type) * - wts->local_bounds[local_index].get_volume(); - CHECK_CUDA(cudaMalloc(&device_ptr, weights_size)); - CHECK_CUDA(cudaMemcpy( - device_ptr, wts->local_allocation[local_index], weights_size, - cudaMemcpyHostToDevice)); - // Free the old allocation since we no longer need it - std::free(wts->local_allocation[local_index]); - wts->local_allocation[local_index] = device_ptr; - wts->local_memory[local_index] = local_fb; - } - // Note we don't copy down any bias weights since they are tiny and - // can be managed by legion very efficiently - } -#endif -} - -void -Conv2D::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding 
function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the pass-by-value arguments - std::map values; -#ifdef LEGION_USE_CUDA - unsigned arg_index = 0; - Conv2DArgs copy_args[MAX_LOCAL_PROCS]; -#endif - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); -#ifdef LEGION_USE_CUDA - // Need to make a copy here of the Conv2D so we can fill in our pointer - // without racing with other instances doing the same thing - assert(arg_index < MAX_LOCAL_PROCS); - Conv2DArgs& arg = copy_args[arg_index++]; - arg = args[local_index]; - arg.workSpace = workspaces[instance_index][local_index]; - values[itr.p] = TaskArgument(&arg, sizeof(args)); -#else - values[itr.p] = TaskArgument(args + local_index, sizeof(Conv2DArgs)); -#endif - } - argmaps[instance_index] = runtime->construct_future_map( - ctx, launch_space, values, true /*collective*/, shardfn->sharding_id); - - // Create logical regions for the weights and output data - assert(outputs.size() == 1); - LogicalRegion output_region = instance->create_tensor_region(outputs[0]); - - // Create logical regions for the weights - assert(!weights.empty() && (weights.size() <= 2)); - LogicalRegion weight_region = instance->create_tensor_region(weights[0]); - LogicalRegion bias_region = LogicalRegion::NO_REGION; - if (use_bias) - bias_region = instance->create_tensor_region(weights[1]); - - // Create partitions for the input regions - assert(inputs.size() == 1); - assert(inputs[0]->region[instance_index].exists()); - LogicalRegion input_region = inputs[0]->region[instance_index]; - IndexPartition part = instance->find_or_create_partition( - input_region.get_index_space(), launch_space, input_transform, - input_extent, LEGION_COMPUTE_COMPLETE_KIND); - LogicalPartition input_part = runtime->get_logical_partition_by_tree( - ctx, part, input_region.get_field_space(), input_region.get_tree_id()); - - // Create partitions for the weights and output regions - LogicalPartition weight_part = - instance->find_or_create_tiled_partition(weights[0], strategy); - LogicalPartition output_part = - instance->find_or_create_tiled_partition(outputs[0], strategy); - - // Attach weight logical regions to the existing buffers - IndexAttachLauncher weight_attach_launcher( - LEGION_EXTERNAL_INSTANCE, weight_region, false /*restricted*/); - const std::vector attach_field(1, FID_DATA); - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); - const LogicalRegion weight_lr = - runtime->get_logical_subregion_by_color(ctx, weight_part, itr.p); - weight_attach_launcher.attach_array_soa( - weight_lr, weights[0]->local_allocation[local_index], - false /*column major*/, attach_field, - weights[0]->local_memory[local_index]); - } - weight_attachments[instance_index] = - runtime->attach_external_resources(ctx, weight_attach_launcher); - - if (use_bias) { - // Bias should have the same bounds across all the processors - // so we just attach on the first one - AttachLauncher bias_attach_launcher( - LEGION_EXTERNAL_INSTANCE, bias_region, bias_region, - false /*restricted*/, false /*mapped*/); - bias_attach_launcher.attach_array_soa( - 
weights[1]->local_allocation[0], false /*column major*/, attach_field, - weights[1]->local_memory[0]); - bias_attachments[instance_index] = - runtime->attach_external_resource(ctx, bias_attach_launcher); - } - - // Construct a launcher for running the inference task - IndexTaskLauncher& launcher = launchers[instance_index]; - launcher = IndexTaskLauncher( - CONV2D_TASK_ID, launch_space, TaskArgument(NULL, 0), - ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED, - false /*must*/, mapper, strategy->tag); - launcher.add_region_requirement(RegionRequirement( - input_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input_region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, LEGION_EXCLUSIVE, - output_region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - weight_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - weight_region)); - launcher.add_field(2, FID_DATA); - if (use_bias) { - launcher.add_region_requirement(RegionRequirement( - weights[1]->region[instance_index], 0 /*projection id*/, - LEGION_READ_ONLY, LEGION_EXCLUSIVE, - weights[1]->region[instance_index])); - launcher.add_field(3, FID_DATA); - } -} - -void -Conv2D::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - runtime->execute_index_space(ctx, launchers[instance_index]); -} - -void -Conv2D::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - runtime->detach_external_resources(ctx, weight_attachments[instance_index]); - if (use_bias) - runtime->detach_external_resource(ctx, bias_attachments[instance_index]); - argmaps[instance_index] = FutureMap(); -} - -void -Conv2D::Free(Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); -#ifdef LEGION_USE_CUDA - Conv2DArgs& proc_args = args[local_index]; - if (proc.kind() == Processor::TOC_PROC) { - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.inputTensor)); - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.outputTensor)); - if (use_bias) { - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.biasTensor)); - } - CHECK_CUDNN(cudnnDestroyFilterDescriptor(proc_args.filterDesc)); - CHECK_CUDNN(cudnnDestroyActivationDescriptor(proc_args.actiDesc)); - CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(proc_args.convDesc)); - CHECK_CUDA(cudaFree(weights[0]->local_allocation[local_index])); - if (use_bias) { - std::free(weights[1]->local_allocation[local_index]); - weights[1]->local_allocation[local_index] = nullptr; - } - if (proc_args.workSpaceSize > 0) { - for (int idx = 0; idx < MAX_NUM_INSTANCES; idx++) { - CHECK_CUDA(cudaFree(workspaces[idx][local_index])); - } - } - } else -#endif - { - for (Weights* wts : weights) { - std::free(wts->local_allocation[local_index]); - wts->local_allocation[local_index] = nullptr; - } - } -} - -/*static*/ void -Conv2D::PreregisterTaskVariants(void) -{ - { - TaskVariantRegistrar cpu_registrar(CONV2D_TASK_ID, "Conv2D CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - cpu_registrar, "Conv2D Operator"); - } -#ifdef LEGION_USE_CUDA - { - 
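Conv2D never copies its weights through Legion: initialize() attaches the externally managed per-processor buffers (host or device memory, wherever Load left them) as instances of the weight subregions, and finalize() detaches them again. A condensed sketch of the attach/detach pairing used above, with the member names taken from the deleted source:

    // Condensed from Conv2D::initialize/finalize; one external buffer per
    // local weight subregion, SOA layout, single FID_DATA field.
    IndexAttachLauncher attach(
        LEGION_EXTERNAL_INSTANCE, weight_region, false /*restricted*/);
    const std::vector<FieldID> fields(1, FID_DATA);
    for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) {
      const Processor proc = shardfn->find_proc(itr.p, launch_domain);
      if (!strategy->is_local_processor(proc))
        continue;
      const unsigned local = strategy->find_local_offset(proc);
      const LogicalRegion sub =
          runtime->get_logical_subregion_by_color(ctx, weight_part, itr.p);
      attach.attach_array_soa(
          sub, weights[0]->local_allocation[local], false /*column major*/,
          fields, weights[0]->local_memory[local]);
    }
    weight_attachments[instance_index] =
        runtime->attach_external_resources(ctx, attach);
    // ...and the matching teardown in finalize():
    runtime->detach_external_resources(ctx, weight_attachments[instance_index]);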
TaskVariantRegistrar gpu_registrar(CONV2D_TASK_ID, "Conv2D GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - gpu_registrar, "Conv2D Operator"); - } -#endif -} - -/*static*/ void -Conv2D::forward_cpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - // TODO: implement this - assert(false); -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Conv2D::forward_gpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(Conv2DArgs)); - const Conv2DArgs* args = (const Conv2DArgs*)task->local_args; - assert(regions.size() == (3 + int(args->use_bias))); - assert(task->regions.size() == (3 + int(args->use_bias))); - - const void* input_ptr = TensorAccessor::access( - args->input_datatype, args->input_bounds, regions[0]); - void* output_ptr = TensorAccessor::access( - args->output_datatype, args->local_bounds, regions[1]); - const void* filter_ptr = TensorAccessor::access( - args->filter_datatype, args->local_bounds, regions[2]); - const void* bias_ptr = NULL; - if (args->use_bias) - bias_ptr = TensorAccessor::access( - args->bias_datatype, args->bias_bounds, regions[3]); -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - CHECK_CUDNN(cudnnSetStream(args->cudnn, stream)); -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - Conv2D::forward_kernel(args, input_ptr, output_ptr, filter_ptr, bias_ptr); - if (args->profiling) { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_end)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - CHECK_CUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - CHECK_CUDA(cudaEventDestroy(t_start)); - CHECK_CUDA(cudaEventDestroy(t_end)); - printf( - "%s [Conv2D] forward time (CF) = %.2fms\n", - args->owner->op_name.c_str(), elapsed); - } -} - -/*static*/ void -Conv2D::forward_kernel( - const Conv2DArgs* args, const void* input_ptr, void* output_ptr, - const void* filter_ptr, const void* bias_ptr) -{ - float alpha = 1.0f, beta = 0.0f; - CHECK_CUDNN(cudnnConvolutionForward( - args->cudnn, &alpha, args->inputTensor, input_ptr, args->filterDesc, - filter_ptr, args->convDesc, args->fwdAlgo, args->workSpace, - args->workSpaceSize, &beta, args->outputTensor, output_ptr)); - - if (bias_ptr != NULL) { - CHECK_CUDNN(cudnnAddTensor( - args->cudnn, &alpha, args->biasTensor, bias_ptr, &alpha, - args->outputTensor, output_ptr)); - } - if (args->relu) { - CHECK_CUDNN(cudnnActivationForward( - args->cudnn, args->actiDesc, &alpha, args->outputTensor, output_ptr, - &beta, args->outputTensor, output_ptr)); - } -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/conv2d.h b/triton/src/operators/conv2d.h deleted file mode 100644 index 96e8871916..0000000000 --- a/triton/src/operators/conv2d.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
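Conv2D::Load above also demonstrates the usual cuDNN algorithm-selection dance: benchmark the candidate algorithms with cudnnFindConvolutionForwardAlgorithm, take the fastest (results come back sorted), then size and preallocate the workspace that algorithm needs so no device allocation happens on the inference path. A condensed sketch, assuming the tensor/filter/convolution descriptors are already configured as in Load:

    // Condensed from Conv2D::Load.
    const int reqAlgCnt = 8;
    int cnt = 0;
    cudnnConvolutionFwdAlgoPerf_t perfResults[reqAlgCnt];
    CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
        cudnn, inputTensor, filterDesc, convDesc, outputTensor,
        reqAlgCnt, &cnt, perfResults));
    assert(cnt > 0);
    CHECK_CUDNN(perfResults[0].status);  // entries are sorted fastest-first
    cudnnConvolutionFwdAlgo_t fwdAlgo = perfResults[0].algo;

    size_t workSpaceSize = 0;
    CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn, inputTensor, filterDesc, convDesc, outputTensor, fwdAlgo,
        &workSpaceSize));
    void* workSpace = nullptr;
    if (workSpaceSize > 0)  // preallocate off the hot path
      CHECK_CUDA(cudaMalloc(&workSpace, workSpaceSize));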
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_CONV2D_H__ -#define __LEGION_TRITON_CONV2D_H__ - -#include "operator.h" -#include "tensor.h" -#ifdef LEGION_USE_CUDA -#include "cudahelp.h" -#endif - -namespace triton { namespace backend { namespace legion { - -struct Conv2DArgs : public OperatorArgs { - public: - Conv2DArgs(bool rel = true, bool bias = true) - : OperatorArgs(), local_index(0), relu(rel), use_bias(bias) -#ifndef LEGION_USE_CUDA - { - } -#else - { - workSpace = nullptr; - workSpaceSize = 0; - } - cudnnHandle_t cudnn; - cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; - cudnnFilterDescriptor_t filterDesc; - cudnnActivationDescriptor_t actiDesc; - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t fwdAlgo; - void* workSpace; // device workspace pointer - size_t workSpaceSize; -#endif - Legion::Rect<4> input_bounds; - Legion::Rect<4> local_bounds; - Legion::Rect<1> bias_bounds; - DataType input_datatype; - DataType output_datatype; - DataType filter_datatype; - DataType bias_datatype; - unsigned local_index; - bool relu, use_bias; -}; - -class Conv2D : public Operator { - public: - Conv2D( - LegionModelState* model, const LayerStrategy* strategy, size_t inChannels, - size_t outChannels, size_t kernelH, size_t kernelW, size_t strideH, - size_t strideW, size_t paddingH, size_t paddingW, - ActivationMode activation, size_t groups, bool use_bias, - const char* name); - virtual ~Conv2D(void); - - public: - void Configure( - Tensor* input, Weights* weights, Tensor* output, Weights* bias = NULL); - Legion::Rect<4> GetInputBounds(Realm::Processor proc); - Legion::Rect<4> GetWeightBounds(Realm::Processor proc); - Legion::Rect<1> GetBiasBounds(Realm::Processor proc); - Legion::Rect<4> GetOutputBounds(Realm::Processor proc); - - public: - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - public: - static void PreregisterTaskVariants(void); - static void forward_cpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#ifdef LEGION_USE_CUDA - public: - // Forward task for the GPU - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - - protected: - static void forward_kernel( - const Conv2DArgs* args, const void* input_ptr, void* output_ptr, - const void* filter_ptr, const void* bias_ptr); -#endif - public: - const ActivationMode activation; - const size_t in_channels, out_channels, kernel_h, kernel_w; - const size_t stride_h, stride_w, 
padding_h, padding_w, groups;
-  const bool use_bias;
-
- protected:
-  Legion::DomainTransform input_transform;
-  Legion::Domain input_extent;
-  Conv2DArgs args[MAX_LOCAL_PROCS];
-  Legion::FutureMap argmaps[MAX_NUM_INSTANCES];
-  Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES];
-  Legion::ExternalResources weight_attachments[MAX_NUM_INSTANCES];
-  Legion::PhysicalRegion bias_attachments[MAX_NUM_INSTANCES];
-#ifdef LEGION_USE_CUDA
-  void* workspaces[MAX_NUM_INSTANCES][MAX_LOCAL_PROCS];
-#endif
-};
-
-}}} // namespace triton::backend::legion
-
-#endif // __LEGION_TRITON_CONV2D_H__
diff --git a/triton/src/operators/flat.h b/triton/src/operators/flat.h
deleted file mode 100644
index 30c75fac5f..0000000000
--- a/triton/src/operators/flat.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LEGION_TRITON_FLAT_H__
-#define __LEGION_TRITON_FLAT_H__
-
-#include "operator.h"
-#include "tensor.h"
-
-namespace triton { namespace backend { namespace legion {
-
-struct FlatArgs : public OperatorArgs {
- public:
-};
-
-class Flat : public Operator {
- public:
-  Flat(LegionModelState* state, const char* name);
-
-  void configure(Tensor* input, Tensor* output);
-
-  virtual void initialize(LegionModelInstance* instance);
-  virtual void forward(LegionModelInstance* instance);
-  virtual void finalize(LegionModelInstance* instance);
-
-  static FlatArgs initialize_gpu(
-      const Legion::Task* task,
-      const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx,
-      Legion::Runtime* runtime);
-  static void forward_gpu(
-      const Legion::Task* task,
-      const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx,
-      Legion::Runtime* runtime);
-  static void forward_kernel(
-      const FlatArgs* args, const void* input_ptr, void* output_ptr,
-      size_t num_elements);
-
- public:
-  LegionModelState* const model;
-};
-
-}}} // namespace triton::backend::legion
-
-#endif // __LEGION_TRITON_FLAT_H__
diff --git a/triton/src/operators/linear.h b/triton/src/operators/linear.h
deleted file mode 100644
index 11b5a95e1f..0000000000
--- a/triton/src/operators/linear.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef __LEGION_TRITON_LINEAR_H__ -#define __LEGION_TRITON_LINEAR_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -struct LinearArgs : public OperatorArgs { - public: - LinearArgs(size_t batch_size); - cudnnTensorDescriptor_t outputTensor; - cudnnActivationDescriptor_t actiDesc; - ActivationMode activation; - bool use_bias; -}; - -class Linear : public Operator { - public: - Linear( - LegionModelState* model, unsigned out_dim, ActivationMode activation, - bool use_bias, const char* name); - - void configure( - Tensor* input, Weights* weights, Tensor* output, Weights* bias = NULL); - - virtual void initialize(LegionModelInstance* instance); - virtual void forward(LegionModelInstance* instance); - virtual void finalize(LegionModelInstance* instance); - - static LinearArgs initialize_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - static void forward_kernel( - const LinearArgs* args, const void* input_ptr, void* output_ptr, - const void* filter_ptr, const void* bias_ptr, unsigned in_dim, - unsigned out_dim, size_t batch_size); - - public: - LegionModelState* const model; - const unsigned in_channels, out_channels; - const bool use_bias; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_LINEAR_H__ diff --git a/triton/src/operators/matmul.cc b/triton/src/operators/matmul.cc deleted file mode 100644 index e972665adf..0000000000 --- a/triton/src/operators/matmul.cc +++ /dev/null @@ -1,913 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "matmul.h"
-
-using namespace Legion;
-
-namespace triton { namespace backend { namespace legion {
-
-MatMulProjectionFunctor::MatMulProjectionFunctor(
-    ProjectionID id, const DomainTransform& trans)
-    : ProjectionFunctor(), functor_id(id), domain_transform(trans)
-{
-}
-
-LogicalRegion
-MatMulProjectionFunctor::project(
-    LogicalPartition upper_bound, const DomainPoint& point,
-    const Domain& domain)
-{
-  return runtime->get_logical_subregion_by_color(upper_bound, transform(point));
-}
-
-MatMulArgs::MatMulArgs(void) {}
-
-MatMul::MatMul(
-    LegionModelState* model, const LayerStrategy* strategy, const char* name)
-    : Operator(model, strategy, OperatorType::OP_MATMUL, name, 2, 0, 1),
-      in1_proj(nullptr), in2_proj(nullptr)
-{
-}
-
-template <unsigned DIM>
-void
-MatMul::compute_in1_parameters(Tensor* in1, Tensor* out)
-{
-  Rect<DIM> extent, colors;
-  Transform<DIM, DIM> transform;
-  for (int i = 0; i < DIM; i++)
-    for (int j = 0; j < DIM; j++) transform[i][j] = 0;
-  assert(out->bounds.size() >= in1->bounds.size());
-  size_t dimoff = out->bounds.size() - in1->bounds.size();
-  for (int i = 0; i < DIM; i++) {
-    extent.lo[i] = 0;
-    colors.lo[i] = 0;
-    if (i == (DIM - 1)) {
-      /* need the whole dimension */
-      extent.hi[i] = in1->bounds[i] - 1; /*inclusive*/
-      colors.hi[i] = 0;
-    } else if (in1->bounds[i] == 1) {
-      extent.hi[i] = 0;
-      colors.hi[i] = 0;
-    } else {
-      size_t pieces = strategy->dim[dimoff + i];
-      size_t chunks = (in1->bounds[i] + pieces - 1) / pieces;
-      extent.hi[i] = chunks - 1; /*inclusive*/
-      colors.hi[i] = pieces - 1; /*inclusive*/
-    }
-  }
-  in1_transform = transform;
-  in1_extent = extent;
-  in1_colors = colors;
-}
-
-template <unsigned DIM>
-void
-MatMul::compute_in2_parameters(Tensor* in2, Tensor* out)
-{
-  Rect<DIM> extent, colors;
-  Transform<DIM, DIM> transform;
-  for (int i = 0; i < DIM; i++)
-    for (int j = 0; j < DIM; j++) transform[i][j] = 0;
-  assert(out->bounds.size() >= in2->bounds.size());
-  size_t dimoff = out->bounds.size() - in2->bounds.size();
-  for (int i = 0; i < DIM; i++) {
-    extent.lo[i] = 0;
-    colors.lo[i] = 0;
-    if (i == (DIM - 2)) {
-      /* need the whole dimension */
-      extent.hi[i] = in2->bounds[i] - 1; /*inclusive*/
-      colors.hi[i] = 0;
-    } else if (in2->bounds[i] == 1) {
-      extent.hi[i] = 0;
-      colors.hi[i] = 0;
-    } else {
-      size_t pieces = strategy->dim[dimoff + i];
-      size_t chunks = (in2->bounds[i] + pieces - 1) / pieces;
-      extent.hi[i] = chunks - 1; /*inclusive*/
-      colors.hi[i] = pieces - 1; /*inclusive*/
-    }
-  }
-  in2_transform = transform;
-  in2_extent = extent;
-  in2_colors = colors;
-}
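
A worked illustration of the tiling rule above, as a standalone sketch with
made-up bounds and strategy dimensions (not code from this file): every
dimension except the contracted one is split into strategy->dim[...] pieces of
ceil(bound / pieces) elements each, which yields the extent (tile rectangle)
and colors (tile grid) pair stored by the compute_in*_parameters helpers.

    #include <cstdio>
    #include <vector>

    int main() {
      // hypothetical in2 bounds (batch, K, N) and per-dimension piece counts
      std::vector<std::size_t> bounds = {4, 256, 512};
      std::vector<std::size_t> pieces = {2, 1, 4};  // K (dim 1) is kept whole
      for (std::size_t i = 0; i < bounds.size(); i++) {
        std::size_t chunks = (bounds[i] + pieces[i] - 1) / pieces[i];
        std::printf("dim %zu: extent [0, %zu], colors [0, %zu]\n",
                    i, chunks - 1, pieces[i] - 1);
      }
      return 0;
    }
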
-
-void
-MatMul::Configure(Tensor* in1, Tensor* in2, Tensor* out)
-{
-  assert(in1 != nullptr);
-  assert(in2 != nullptr);
-  assert(out != nullptr);
-  inputs.push_back(in1);
-  inputs.push_back(in2);
-  outputs.push_back(out);
-
-  if ((in1->bounds.size() == 1) && (in2->bounds.size() == 1)) {
-    fprintf(stderr, "TODO: support for dot-product in matmul operator");
-    abort();
-  } else if (in1->bounds.size() == 1) {
-    const size_t in2_dim = in2->bounds.size();
-    const size_t out_dim = out->bounds.size();
-    assert(in2_dim >= 2);
-    assert(out_dim >= 1);
-    const size_t n = out->bounds[out_dim - 1];
-    const size_t k = in1->bounds[0];
-    assert(in2->bounds[in2_dim - 2] == k);
-    assert(in2->bounds[in2_dim - 1] == n);
-    // make sure all the other dimensions align or broadcast
-    unsigned in2_broadcasts = 0;
-    for (unsigned off = 3; off <= out_dim; off++) {
-      const size_t out_size = out->bounds[out_dim - off];
-      if (off <= in2_dim) {
-        const size_t size = in2->bounds[in2_dim - off];
-        assert((size == 1) || (size == out_size));
-        if (size == 1)
-          in2_broadcasts |= (1 << (off - 3));
-      }
-    }
-    FunctorKey in1_key(out->bounds.size(), 1, 0);
-    FunctorTable::const_iterator finder = in1_functors.find(in1_key);
-    assert(finder != in1_functors.end());
-    in1_proj = finder->second;
-
-    FunctorKey in2_key(out->bounds.size(), in2->bounds.size(), in2_broadcasts);
-    finder = in2_functors.find(in2_key);
-    assert(finder != in2_functors.end());
-    in2_proj = finder->second;
-
-    Rect<1> extent, colors;
-    Transform<1, 1> transform;
-    transform[0][0] = 0;
-    extent.lo[0] = 0;
-    extent.hi[0] = in1->bounds[0] - 1; // inclusive
-    colors.lo[0] = 0;
-    colors.hi[0] = 0;
-    in1_transform = transform;
-    in1_extent = extent;
-    in1_colors = colors;
-
-    switch (in2->bounds.size()) {
-#define DIMFUNC(DIM) \
-  case DIM: { \
-    compute_in2_parameters<DIM>(in2, out); \
-    break; \
-  }
-      LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-      default:
-        abort();
-    }
-  } else if (in2->bounds.size() == 1) {
-    const size_t in1_dim = in1->bounds.size();
-    const size_t out_dim = out->bounds.size();
-    assert(in1_dim >= 2);
-    const size_t m = (out_dim > 1) ? out->bounds[out_dim - 2] : 1;
-    assert(out->bounds[out_dim - 1] == 1);
-    assert(in1->bounds[in1_dim - 2] == m);
-    const size_t k = in1->bounds[in1_dim - 1];
-    assert(in2->bounds[0] == k);
-    // make sure all the other dimensions align or broadcast
-    unsigned in1_broadcasts = 0;
-    for (unsigned off = 3; off <= out_dim; off++) {
-      const size_t out_size = out->bounds[out_dim - off];
-      if (off <= in1_dim) {
-        const size_t size = in1->bounds[in1_dim - off];
-        assert((size == 1) || (size == out_size));
-        if (size == 1)
-          in1_broadcasts |= (1 << (off - 3));
-      }
-    }
-    FunctorKey in1_key(out->bounds.size(), in1->bounds.size(), in1_broadcasts);
-    FunctorTable::const_iterator finder = in1_functors.find(in1_key);
-    assert(finder != in1_functors.end());
-    in1_proj = finder->second;
-
-    FunctorKey in2_key(out->bounds.size(), 1, 0);
-    finder = in2_functors.find(in2_key);
-    assert(finder != in2_functors.end());
-    in2_proj = finder->second;
-
-    switch (in1->bounds.size()) {
-#define DIMFUNC(DIM) \
-  case DIM: { \
-    compute_in1_parameters<DIM>(in1, out); \
-    break; \
-  }
-      LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-      default:
-        abort();
-    }
-
-    Rect<1> extent, colors;
-    Transform<1, 1> transform;
-    transform[0][0] = 0;
-    extent.lo[0] = 0;
-    extent.hi[0] = in2->bounds[0] - 1; // inclusive
-    colors.lo[0] = 0;
-    colors.hi[0] = 0;
-    in2_transform = transform;
-    in2_extent = extent;
-    in2_colors = colors;
-
-  } else {
-    // all tensors have at least two dimensions
-    const size_t in1_dim = in1->bounds.size();
-    const size_t in2_dim = in2->bounds.size();
-    const size_t out_dim = out->bounds.size();
-    assert(in1_dim >= 2);
-    assert(in2_dim >= 2);
-    assert(out_dim >= 2);
-    const size_t m = out->bounds[out_dim - 2];
-    const size_t n = out->bounds[out_dim - 1];
-    assert(in1->bounds[in1_dim - 2] == m);
-    const size_t k = in1->bounds[in1_dim - 1];
-    assert(in2->bounds[in2_dim - 2] == k);
-    assert(in2->bounds[in2_dim - 1] == n);
-    // make sure all the other dimensions align or can broadcast
-    unsigned in1_broadcasts = 0, in2_broadcasts = 0;
-    for (unsigned off = 3; off <= out_dim; off++) {
-      const size_t out_size = out->bounds[out_dim - off];
-      if (off <= in1_dim) {
-        const size_t size = in1->bounds[in1_dim - off];
-        assert((size == 1) || (size == out_size));
-        if (size == 1)
-          in1_broadcasts |= (1 << (off - 3));
-      }
-      if (off <= in2_dim) {
-        const size_t size = in2->bounds[in2_dim - off];
-        assert((size == 1) || (size == out_size));
-        if (size == 1)
in2_broadcasts |= (1 << (off - 3)); - } - } - FunctorKey in1_key(out->bounds.size(), in1->bounds.size(), in1_broadcasts); - FunctorTable::const_iterator finder = in1_functors.find(in1_key); - assert(finder != in1_functors.end()); - in1_proj = finder->second; - - FunctorKey in2_key(out->bounds.size(), in2->bounds.size(), in2_broadcasts); - finder = in2_functors.find(in2_key); - assert(finder != in2_functors.end()); - in2_proj = finder->second; - - // Finally fill in the input transforms, extents, and colors for the inputs - switch (in1->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - compute_in1_parameters(in1, out); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - - switch (in2->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - compute_in2_parameters(in2, out); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - } -} - -Domain -MatMul::GetIn1Bounds(Processor proc) -{ - assert(in1_proj != nullptr); - const DomainPoint point = strategy->find_local_point(proc); - const DomainPoint local = in1_proj->transform(point); - const DomainPoint offset = in1_transform * local; - switch (inputs[0]->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - Point off = offset; \ - Rect extent = in1_extent; \ - Rect bounds(extent.lo + off, extent.hi + off); \ - Point upper; \ - for (int i = 0; i < DIM; i++) upper[i] = inputs[0]->bounds[i] - 1; \ - Rect full(Point::ZEROES(), upper); \ - Rect result = full.intersection(bounds); \ - return Domain(result); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - return Domain(); -} - -Domain -MatMul::GetIn2Bounds(Processor proc) -{ - assert(in2_proj != nullptr); - const DomainPoint point = strategy->find_local_point(proc); - const DomainPoint local = in2_proj->transform(point); - const DomainPoint offset = in2_transform * local; - switch (inputs[1]->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - Point off = offset; \ - Rect extent = in2_extent; \ - Rect bounds(extent.lo + off, extent.hi + off); \ - Point upper; \ - for (int i = 0; i < DIM; i++) upper[i] = inputs[1]->bounds[i] - 1; \ - Rect full(Point::ZEROES(), upper); \ - Rect result = full.intersection(bounds); \ - return Domain(result); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - return Domain(); -} - -Domain -MatMul::GetOutBounds(Processor proc) -{ - assert(outputs[0]->bounds.size() == size_t(strategy->nDims)); - const size_t dims = outputs[0]->bounds.size(); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - return strategy->find_local_domain(proc, global); -} - -void -MatMul::Load(Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - MatMulArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.in1_bounds = GetIn1Bounds(proc); - proc_args.in2_bounds = GetIn2Bounds(proc); - proc_args.out_bounds = GetOutBounds(proc); - proc_args.in1_datatype = inputs[0]->type; - proc_args.in2_datatype = inputs[1]->type; - proc_args.out_datatype = outputs[0]->type; -#ifdef LEGION_USE_CUDA - if (proc.kind() == Processor::TOC_PROC) - proc_args.cublas = model->runtime_->cublas[local_index]; -#endif -} - -void 
-MatMul::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the pass-by-value arguments - std::map values; - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); - values[itr.p] = TaskArgument(args + local_index, sizeof(MatMulArgs)); - } - argmaps[instance_index] = runtime->construct_future_map( - ctx, launch_space, values, true /*collective*/, shardfn->sharding_id); - - IndexTaskLauncher& launcher = launchers[instance_index]; - launcher = IndexTaskLauncher( - MATMUL_TASK_ID, launch_space, TaskArgument(NULL, 0), - ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED, - false /*must*/, mapper, strategy->tag); - // Create partition for the output region - LogicalRegion output_region = instance->create_tensor_region(outputs[0]); - LogicalPartition output_part = - instance->find_or_create_tiled_partition(outputs[0], strategy); - launcher.add_region_requirement(RegionRequirement( - output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, LEGION_EXCLUSIVE, - output_region)); - launcher.add_field(0, FID_DATA); - // Create partition for the input regions - LogicalRegion in1_region = inputs[0]->region[instance_index]; - IndexSpace in1_colorspace = instance->find_or_create_index_space(in1_colors); - IndexPartition index1_part = instance->find_or_create_partition( - in1_region.get_index_space(), in1_colorspace, in1_transform, in1_extent, - LEGION_DISJOINT_COMPLETE_KIND); - LogicalPartition in1_part = runtime->get_logical_partition_by_tree( - ctx, index1_part, in1_region.get_field_space(), in1_region.get_tree_id()); - launcher.add_region_requirement(RegionRequirement( - in1_part, in1_proj->functor_id, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - in1_region)); - launcher.add_field(1, FID_DATA); - LogicalRegion in2_region = inputs[1]->region[instance_index]; - IndexSpace in2_colorspace = instance->find_or_create_index_space(in2_colors); - IndexPartition index2_part = instance->find_or_create_partition( - in2_region.get_index_space(), in2_colorspace, in2_transform, in2_extent, - LEGION_DISJOINT_COMPLETE_KIND); - LogicalPartition in2_part = runtime->get_logical_partition_by_tree( - ctx, index2_part, in2_region.get_field_space(), in2_region.get_tree_id()); - launcher.add_region_requirement(RegionRequirement( - in2_part, in2_proj->functor_id, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - in2_region)); - launcher.add_field(2, FID_DATA); -} - -void -MatMul::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - runtime->execute_index_space(ctx, launchers[instance_index]); -} - -void -MatMul::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - argmaps[instance_index] = FutureMap(); -} - -void -MatMul::Free(Processor proc) -{ - // Nothing to do in this case -} - -/*static instantiations*/ -MatMul::FunctorTable MatMul::in1_functors; -MatMul::FunctorTable 
MatMul::in2_functors;
-
-template <unsigned IDIM, unsigned ODIM>
-/*static*/ void
-MatMul::generate_specific_functors(void)
-{
-  assert(ODIM <= IDIM);
-  // Enumerate all the combinations of bit masks for broadcasting
-  const unsigned combinations = (ODIM >= 2) ? 1 << (ODIM - 2) : 1;
-  for (unsigned idx = 0; idx < combinations; idx++) {
-    const FunctorKey key(IDIM, ODIM, idx);
-    // Input1 case: partition on ODIM-2 but broadcast on ODIM-1
-    {
-      Transform<ODIM, IDIM> transform;
-      // Initialize everything to zeros to start
-      for (int i = 0; i < ODIM; i++)
-        for (int j = 0; j < IDIM; j++) transform[i][j] = 0;
-      // Work backwards for broadcasting
-      for (int off = 1; off <= ODIM; off++) {
-        if (off == 1) {
-          // broadcast
-          transform[ODIM - off][IDIM - off] = 0;
-        } else if (off == 2) {
-          // use partition
-          transform[ODIM - off][IDIM - off] = 1;
-        } else {
-          // check for broadcast
-          transform[ODIM - off][IDIM - off] = (idx & (1 << (off - 3))) ? 0 : 1;
-        }
-      }
-      DomainTransform domain_transform(transform);
-      ProjectionID id = Runtime::generate_static_projection_id();
-      MatMulProjectionFunctor* functor =
-          new MatMulProjectionFunctor(id, domain_transform);
-      Runtime::preregister_projection_functor(id, functor);
-      assert(in1_functors.find(key) == in1_functors.end());
-      in1_functors[key] = functor;
-    }
-    // Input2 case: broadcast on ODIM-2 but partition on ODIM-1
-    {
-      Transform<ODIM, IDIM> transform;
-      // Initialize everything to zeros to start
-      for (int i = 0; i < ODIM; i++)
-        for (int j = 0; j < IDIM; j++) transform[i][j] = 0;
-      // Work backwards for broadcasting
-      for (int off = 1; off <= ODIM; off++) {
-        if (off == 1) {
-          // use partition (unless we're a vector so we're broadcasting)
-          transform[ODIM - off][IDIM - off] = (ODIM == 1) ? 0 : 1;
-        } else if (off == 2) {
-          // broadcast
-          transform[ODIM - off][IDIM - off] = 0;
-        } else {
-          // check for broadcast
-          transform[ODIM - off][IDIM - off] = (idx & (1 << (off - 3))) ?
0 : 1; - } - } - DomainTransform domain_transform(transform); - ProjectionID id = Runtime::generate_static_projection_id(); - MatMulProjectionFunctor* functor = - new MatMulProjectionFunctor(id, domain_transform); - Runtime::preregister_projection_functor(id, functor); - assert(in2_functors.find(key) == in2_functors.end()); - in2_functors[key] = functor; - } - } -} - -template -/*static*/ void -MatMul::generate_all_functors(void) -{ - for (int i = 1; i <= IDIM; i++) { - switch (i) { -#define DIMFUNC(DIM) \ - case DIM: { \ - generate_specific_functors(); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - } -} - -/*static*/ void -MatMul::PreregisterTaskVariants(void) -{ - // Create all possible functors we might need here so these data - // structures can be read-only after this point - for (int i = 2; i <= LEGION_MAX_DIM; i++) { - switch (i) { -#define DIMFUNC(DIM) \ - case DIM: { \ - generate_all_functors(); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - } - { - TaskVariantRegistrar cpu_registrar(MATMUL_TASK_ID, "MatMul CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - cpu_registrar, "MatMul Operator"); - } -#ifdef LEGION_USE_CUDA - { - TaskVariantRegistrar gpu_registrar(MATMUL_TASK_ID, "MatMul GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - gpu_registrar, "MatMul Operator"); - } -#endif -} - -/*static*/ void -MatMul::forward_cpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - // TODO: implement this with OpenBLAS or something like it - abort(); -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -MatMul::forward_gpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(MatMulArgs)); - const MatMulArgs* args = (const MatMulArgs*)task->local_args; -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - CHECK_CUBLAS(cublasSetStream(args->cublas, stream)); -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - assert(regions.size() == 3); - assert(task->regions.size() == 3); - // cublas is dumb and doesn't support row-major, so reverse the matrix - // order to help cublas think things are column-major - // effectively we get NxM = NxK * KxM - uint8_t* out_ptr = nullptr; - size_t m, n, k, batch_count = 1; - size_t lda, ldb, ldc, astride, bstride, cstride; - switch (args->out_bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->out_bounds; \ - out_ptr = (uint8_t*)TensorAccessor::access( \ - args->out_datatype, bounds, regions[0]); \ - for (int i = 0; i < (DIM - 2); i++) \ - batch_count *= ((bounds.hi[i] - bounds.lo[i]) + 1); \ - if (DIM == 1) { \ - assert( \ - (args->in1_bounds.get_dim() == 1) || \ - (args->in2_bounds.get_dim() == 1)); \ - if (args->in1_bounds.get_dim() == 1) { \ - n = 1; \ - m = (bounds.hi[0] - bounds.lo[0]) + 1; \ - } else { \ - n = (bounds.hi[0] - bounds.lo[0]) + 1; \ - m = 1; \ - } \ - } else { \ - n = (bounds.hi[DIM - 2] - bounds.lo[DIM - 2]) + 1; \ - m = (bounds.hi[DIM - 1] - 
bounds.lo[DIM - 1]) + 1; \
-    } \
-    ldc = m; \
-    cstride = m * n; \
-    break; \
-  }
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      abort();
-  }
-  bool has_broadcast = false;
-  const uint8_t *in1_ptr = nullptr, *in2_ptr = nullptr;
-  switch (args->in1_bounds.get_dim()) {
-#define DIMFUNC(DIM) \
-  case DIM: { \
-    const Rect<DIM> bounds = args->in1_bounds; \
-    in1_ptr = (const uint8_t*)TensorAccessor::access( \
-        args->in1_datatype, bounds, regions[1]); \
-    k = (bounds.hi[DIM - 1] - bounds.lo[DIM - 1]) + 1; \
-    ldb = (DIM == 1) ? 1 : k; \
-    if (DIM == 1) \
-      bstride = k; \
-    else \
-      bstride = k * ((bounds.hi[DIM - 2] - bounds.lo[DIM - 2]) + 1); \
-    if (!has_broadcast) { \
-      if (DIM == args->out_bounds.get_dim()) { \
-        const Rect<DIM> out_bounds = args->out_bounds; \
-        for (int i = 0; i < (DIM - 2); i++) { \
-          if ((bounds.hi[i] > 0) || (out_bounds.hi[i] == 0)) \
-            continue; \
-          has_broadcast = true; \
-          break; \
-        } \
-      } else { \
-        has_broadcast = true; \
-      } \
-    } \
-    break; \
-  }
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      abort();
-  }
-  switch (args->in2_bounds.get_dim()) {
-#define DIMFUNC(DIM) \
-  case DIM: { \
-    const Rect<DIM> bounds = args->in2_bounds; \
-    in2_ptr = (const uint8_t*)TensorAccessor::access( \
-        args->in2_datatype, bounds, regions[2]); \
-    lda = (bounds.hi[DIM - 1] - bounds.lo[DIM - 1]) + 1; \
-    if (DIM == 1) \
-      astride = lda; \
-    else \
-      astride = lda * ((bounds.hi[DIM - 2] - bounds.lo[DIM - 2]) + 1); \
-    if (!has_broadcast) { \
-      if (DIM == args->out_bounds.get_dim()) { \
-        const Rect<DIM> out_bounds = args->out_bounds; \
-        for (int i = 0; i < (DIM - 2); i++) { \
-          if ((bounds.hi[i] > 0) || (out_bounds.hi[i] == 0)) \
-            continue; \
-          has_broadcast = true; \
-          break; \
-        } \
-      } else { \
-        has_broadcast = true; \
-      } \
-    } \
-    break; \
-  }
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      abort();
-  }
-  if (has_broadcast) {
-    assert(args->out_bounds.get_dim() > 2);
-    std::vector<size_t> in1_iterations(args->out_bounds.get_dim() - 2, 1);
-    std::vector<size_t> in2_iterations(args->out_bounds.get_dim() - 2, 1);
-    std::vector<size_t> out_iterations(args->out_bounds.get_dim() - 2);
-    const int in1_dims = args->in1_bounds.get_dim();
-    for (int off = 0; off < (args->in1_bounds.get_dim() - 2); off++)
-      in1_iterations[in1_iterations.size() - off - 1] =
-          (args->in1_bounds.hi()[in1_dims - (off + 3)] -
-           args->in1_bounds.lo()[in1_dims - (off + 3)]) +
-          1;
-    const int in2_dims = args->in2_bounds.get_dim();
-    for (int off = 0; off < (args->in2_bounds.get_dim() - 2); off++)
-      in2_iterations[in2_iterations.size() - off - 1] =
-          (args->in2_bounds.hi()[in2_dims - (off + 3)] -
-           args->in2_bounds.lo()[in2_dims - (off + 3)]) +
-          1;
-    for (unsigned dim = 0; dim < out_iterations.size(); dim++)
-      out_iterations[dim] =
-          (args->out_bounds.hi()[dim] - args->out_bounds.lo()[dim]) + 1;
-    // Find the "last full dim" without a broadcast
-    int last_full_dim = in1_iterations.size();
-    size_t partial_batch_count = 1;
-    for (int idx = in1_iterations.size() - 1; idx >= 0; --idx) {
-      if (in1_iterations[idx] == in2_iterations[idx]) {
-        last_full_dim = idx;
-        partial_batch_count *= in1_iterations[idx];
-        continue;
-      }
-      assert((in1_iterations[idx] == 1) || (in2_iterations[idx] == 1));
-      break;
-    }
-    assert(last_full_dim > 0);
-    assert((batch_count % partial_batch_count) == 0);
-    std::vector<size_t> in1_indexes(args->out_bounds.get_dim() - 2, 0);
-    std::vector<size_t> in2_indexes(args->out_bounds.get_dim() - 2, 0);
-    std::vector<size_t> out_indexes(args->out_bounds.get_dim() - 2, 0);
-    while (batch_count > 0) {
-      // iterate the loops
-      for (int dim = last_full_dim - 1; dim >= 0; dim--) {
-        if (++out_indexes[dim] < out_iterations[dim]) {
-          // step the in1 and in2 indexes while checking for broadcasting
-          if (in1_iterations[dim] > 1) {
-            ++in1_indexes[dim];
-            assert(in1_indexes[dim] == out_indexes[dim]);
-          }
-          if (in2_iterations[dim] > 1) {
-            ++in2_indexes[dim];
-            assert(in2_indexes[dim] == out_indexes[dim]);
-          }
-          break;
-        } else {
-          // reset and ripple carry over to the next dim
-          in1_indexes[dim] = 0;
-          in2_indexes[dim] = 0;
-          out_indexes[dim] = 0;
-          assert(dim > 0);
-        }
-      }
-      // compute the local pointers based on our indexes
-      size_t in1_offset = in1_indexes[0];
-      size_t in2_offset = in2_indexes[0];
-      size_t out_offset = out_indexes[0];
-      for (int dim = 1; dim < last_full_dim; dim++) {
-        in1_offset = in1_offset * in1_iterations[dim] + in1_indexes[dim];
-        in2_offset = in2_offset * in2_iterations[dim] + in2_indexes[dim];
-        out_offset = out_offset * out_iterations[dim] + out_indexes[dim];
-      }
-      in1_offset *=
-          partial_batch_count * bstride * sizeof_datatype(args->in1_datatype);
-      in2_offset *=
-          partial_batch_count * astride * sizeof_datatype(args->in2_datatype);
-      out_offset *=
-          partial_batch_count * cstride * sizeof_datatype(args->out_datatype);
-      const uint8_t* in1_local = in1_ptr + in1_offset;
-      const uint8_t* in2_local = in2_ptr + in2_offset;
-      uint8_t* out_local = out_ptr + out_offset;
-      switch (args->out_datatype) {
-        // Use 32-bit intermediate for 16-bit float
-        case DT_HALF:
-        case DT_FLOAT: {
-          float alpha = 1.f, beta = 0.f;
-          CHECK_CUBLAS(cublasGemmStridedBatchedEx(
-              args->cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
-              in2_local, to_cuda_datatype(args->in2_datatype), lda, astride,
-              in1_local, to_cuda_datatype(args->in1_datatype), ldb, bstride,
-              &beta, out_local, to_cuda_datatype(args->out_datatype), ldc,
-              cstride, partial_batch_count, CUBLAS_COMPUTE_32F,
-              CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-          break;
-        }
-        case DT_DOUBLE: {
-          double alpha = 1.0, beta = 0.0;
-          CHECK_CUBLAS(cublasGemmStridedBatchedEx(
-              args->cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
-              in2_local, to_cuda_datatype(args->in2_datatype), lda, astride,
-              in1_local, to_cuda_datatype(args->in1_datatype), ldb, bstride,
-              &beta, out_local, to_cuda_datatype(DT_DOUBLE), ldc, cstride,
-              partial_batch_count, CUBLAS_COMPUTE_64F,
-              CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-          break;
-        }
-        case DT_INT32: {
-          int32_t alpha = 1, beta = 0;
-          CHECK_CUBLAS(cublasGemmStridedBatchedEx(
-              args->cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
-              in2_local, to_cuda_datatype(args->in2_datatype), lda, astride,
-              in1_local, to_cuda_datatype(args->in1_datatype), ldb, bstride,
-              &beta, out_local, to_cuda_datatype(DT_INT32), ldc, cstride,
-              partial_batch_count, CUBLAS_COMPUTE_32I,
-              CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-          break;
-        }
-        default:
-          fprintf(
-              stderr, "Unsupported cublas type for matmul %d\n",
-              args->out_datatype);
-          abort();
-      }
-      batch_count -= partial_batch_count;
-    }
-  } else {
-    // This is the easy case where there are no broadcasts
-    // so we can do the full batch matmul in a single call
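
The operand order in these calls is the row-major/column-major trick noted
above: for row-major C = A x B (A = in1 is m x k, B = in2 is k x n), the same
bytes read as column-major are C^T = B^T x A^T, so cuBLAS is asked for an
n x m product with lda = n, ldb = k, ldc = m and in2 passed as the first
operand. A reference for one batch of the call, written as a hypothetical
standalone helper (float only, for illustration; not part of the original
file):

    // Computes row-major C (m x n) = A (m x k) * B (k x n); cuBLAS produces
    // the same bytes when asked for the column-major product B^T * A^T.
    static void reference_gemm(
        const float* A, const float* B, float* C, size_t m, size_t n, size_t k)
    {
      for (size_t i = 0; i < m; i++)
        for (size_t j = 0; j < n; j++) {
          float acc = 0.f;
          for (size_t p = 0; p < k; p++) acc += A[i * k + p] * B[p * n + j];
          C[i * n + j] = acc;
        }
    }
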
-    switch (args->out_datatype) {
-      // Use 32-bit intermediate for 16-bit float
-      case DT_HALF:
-      case DT_FLOAT: {
-        float alpha = 1.f, beta = 0.f;
-        CHECK_CUBLAS(cublasGemmStridedBatchedEx(
-            args->cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, in2_ptr,
-            to_cuda_datatype(args->in2_datatype), lda, astride, in1_ptr,
-            to_cuda_datatype(args->in1_datatype), ldb, bstride, &beta, out_ptr,
-            to_cuda_datatype(args->out_datatype), ldc, cstride, batch_count,
-            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        break;
-      }
-      case DT_DOUBLE: {
-        double alpha = 1.0, beta = 0.0;
-        CHECK_CUBLAS(cublasGemmStridedBatchedEx(
-            args->cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, in2_ptr,
-            to_cuda_datatype(args->in2_datatype), lda, astride, in1_ptr,
-            to_cuda_datatype(args->in1_datatype), ldb, bstride, &beta, out_ptr,
-            to_cuda_datatype(DT_DOUBLE), ldc, cstride, batch_count,
-            CUBLAS_COMPUTE_64F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        break;
-      }
-      case DT_INT32: {
-        int32_t alpha = 1, beta = 0;
-        CHECK_CUBLAS(cublasGemmStridedBatchedEx(
-            args->cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, in2_ptr,
-            to_cuda_datatype(args->in2_datatype), lda, astride, in1_ptr,
-            to_cuda_datatype(args->in1_datatype), ldb, bstride, &beta, out_ptr,
-            to_cuda_datatype(DT_INT32), ldc, cstride, batch_count,
-            CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        break;
-      }
-      default:
-        fprintf(
-            stderr, "Unsupported cublas type for matmul %d\n",
-            args->out_datatype);
-        abort();
-    }
-  }
-  if (args->profiling) {
-#ifdef DISABLE_LEGION_CUDA_HIJACK
-    CHECK_CUDA(cudaEventRecord(t_end));
-#else
-    CHECK_CUDA(cudaEventRecord(t_end, stream));
-#endif
-    CHECK_CUDA(cudaEventSynchronize(t_end));
-    float elapsed = 0;
-    CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    CHECK_CUDA(cudaEventDestroy(t_start));
-    CHECK_CUDA(cudaEventDestroy(t_end));
-    printf(
-        "%s [MatMul] forward time = %.2fms\n", args->owner->op_name.c_str(),
-        elapsed);
-  }
-}
-#endif
-
-}}} // namespace triton::backend::legion
diff --git a/triton/src/operators/matmul.h b/triton/src/operators/matmul.h
deleted file mode 100644
index 4207d1f85c..0000000000
--- a/triton/src/operators/matmul.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef __LEGION_TRITON_MATMUL_H__ -#define __LEGION_TRITON_MATMUL_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -class MatMulProjectionFunctor : public Legion::ProjectionFunctor { - public: - MatMulProjectionFunctor( - Legion::ProjectionID id, const Legion::DomainTransform& transform); - - public: - inline Legion::DomainPoint transform(const Legion::DomainPoint& point) const - { - return domain_transform * point; - } - - public: - virtual bool is_functional(void) const override { return true; } - virtual unsigned get_depth(void) const override { return 0; } - virtual Legion::LogicalRegion project( - Legion::LogicalPartition upper_bound, const Legion::DomainPoint& point, - const Legion::Domain& domain) override; - - public: - const Legion::ProjectionID functor_id; - const Legion::DomainTransform domain_transform; -}; - -class MatMul; - -struct MatMulArgs : public OperatorArgs { - public: - MatMulArgs(void); - MatMul* owner; - Legion::Domain in1_bounds, in2_bounds, out_bounds; - DataType in1_datatype, in2_datatype, out_datatype; -#ifdef LEGION_USE_CUDA - cublasHandle_t cublas; -#endif -}; - -class MatMul : public Operator { - public: - MatMul( - LegionModelState* model, const LayerStrategy* strategy, const char* name); - - void Configure(Tensor* in1, Tensor* in2, Tensor* output); - Legion::Domain GetIn1Bounds(Realm::Processor proc); - Legion::Domain GetIn2Bounds(Realm::Processor proc); - Legion::Domain GetOutBounds(Realm::Processor proc); - - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - static void PreregisterTaskVariants(void); - - static void forward_cpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - -#ifdef LEGION_USE_CUDA - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#endif - protected: - template - void compute_in1_parameters(Tensor* in1, Tensor* out); - template - void compute_in2_parameters(Tensor* in2, Tensor* out); - - protected: - template - static void generate_all_functors(void); - template - static void generate_specific_functors(void); - - protected: - MatMulArgs args[MAX_LOCAL_PROCS]; - MatMulProjectionFunctor *in1_proj, *in2_proj; - Legion::DomainTransform in1_transform, in2_transform; - Legion::Domain in1_extent, in2_extent; - Legion::Domain in1_colors, in2_colors; - Legion::FutureMap argmaps[MAX_NUM_INSTANCES]; - Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES]; - - public: - // for looking up projection functor IDs - // keys are - typedef std::tuple FunctorKey; - typedef std::map FunctorTable; - static FunctorTable in1_functors, in2_functors; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_MATMUL_H__ diff --git a/triton/src/operators/pool2d.cc b/triton/src/operators/pool2d.cc deleted file mode 100644 index 
d4b2cde9eb..0000000000 --- a/triton/src/operators/pool2d.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "pool2d.h" - -namespace triton { namespace backend { namespace legion { - -// FIXME below are stub functions, need to fill in implementation -Pool2D::Pool2D( - LegionModelState* model, const LayerStrategy* strategy, int kernelH, - int kernelW, int strideH, int strideW, int paddingH, int paddingW, - PoolType type, ActivationMode act, const char* name) - : Operator(model, strategy, OperatorType::OP_POOL2D, name, 1, 0, 1), - activation(act), pool_type(type), kernel_h(kernelH), kernel_w(kernelW), - stride_h(strideH), stride_w(strideW), padding_h(paddingH), - padding_w(paddingW) -{ -} - -Pool2D::~Pool2D() {} -void -Pool2D::Load(Realm::Processor processor) -{ -} -void -Pool2D::Free(Realm::Processor processor) -{ -} -void -Pool2D::Configure(Tensor* input, Tensor* output) -{ -} -void -Pool2D::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ -} -void -Pool2D::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ -} -void -Pool2D::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ -} -Pool2DArgs::Pool2DArgs(void) {} - -#ifdef LEGION_USE_CUDA -Pool2DArgs -Pool2D::initialize_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime) -{ - return Pool2DArgs(); -} -void -Pool2D::forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime) -{ -} -void -Pool2D::forward_kernel( - const Pool2DArgs* args, const void* input_ptr, void* output_ptr) -{ -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/pool2d.h b/triton/src/operators/pool2d.h deleted file mode 100644 index e721fc331b..0000000000 --- a/triton/src/operators/pool2d.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __LEGION_TRITON_POOL2D_H__ -#define __LEGION_TRITON_POOL2D_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -struct Pool2DArgs : public OperatorArgs { - public: - Pool2DArgs(void); -#ifdef LEGION_USE_CUDA - cudnnTensorDescriptor_t inputTensor, outputTensor; - cudnnActivationDescriptor_t actiDesc; - cudnnPoolingDescriptor_t poolDesc; -#endif - bool relu; -}; - -class Pool2D : public Operator { - public: - Pool2D( - LegionModelState* model, const LayerStrategy* strategy, int kernelH, - int kernelW, int strideH, int strideW, int paddingH, int paddingW, - PoolType type, ActivationMode activation, const char* name); - virtual ~Pool2D(void); - - void Configure(Tensor* input, Tensor* output); - - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - -#ifdef LEGION_USE_CUDA - static Pool2DArgs initialize_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - static void forward_kernel( - const Pool2DArgs* args, const void* input_ptr, void* output_ptr); -#endif - public: - const ActivationMode activation; - const PoolType pool_type; - const int kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_POOL2D_H__ diff --git a/triton/src/operators/reshape.cc b/triton/src/operators/reshape.cc deleted file mode 100644 index e5e38075e4..0000000000 --- a/triton/src/operators/reshape.cc +++ /dev/null @@ -1,409 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "reshape.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Reshape::Reshape( - LegionModelState* model, const LayerStrategy* strategy, const char* name) - : Operator(model, strategy, OperatorType::OP_RESHAPE, name, 1, 0, 1) -{ -} - -void -Reshape::Configure(Tensor* input, Tensor* output) -{ - assert(input != nullptr); - assert(output != nullptr); - assert(input->type == output->type); - // Make sure that they have the same volumes - size_t input_volume = 1, output_volume = 1; - for (unsigned idx = 0; idx < input->bounds.size(); idx++) - input_volume *= input->bounds[idx]; - for (unsigned idx = 0; idx < output->bounds.size(); idx++) - output_volume *= output->bounds[idx]; - assert(input_volume == output_volume); - - // Group dimensions from the two input tensors together from - // right-to-left to find ones that can be tiles together - int input_idx = input->bounds.size() - 1; - int output_idx = output->bounds.size() - 1; - while ((input_idx >= 0) && (output_idx >= 0)) { - std::vector input_dims(1, input_idx); - std::vector output_dims(1, output_idx); - size_t input_tile_volume = input->bounds[input_idx--]; - size_t output_tile_volume = output->bounds[output_idx--]; - while (input_tile_volume != output_tile_volume) { - if (input_tile_volume < output_tile_volume) { - input_dims.push_back(input_idx); - input_tile_volume *= input->bounds[input_idx--]; - } else { - output_dims.push_back(output_idx); - output_tile_volume *= output->bounds[output_idx--]; - } - } - input_groups.emplace_back(input_dims); - output_groups.emplace_back(output_dims); - } - // In order to use the output launch space, we need to make sure that - // all but the earliest dimension in each output group has a partitioning - // strategy of 1 or else we won't be able to compute a partition that - // will allow for densely tiled copies. In the future we could fix this - // by computing a generalized index launch space and then mapping that - // onto the original output launch space or just by using affine indirect - // copy launchers when they are available. 
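
A worked example of the grouping above (hypothetical shapes, not taken from
the source): reshaping (2, 3, 4, 5) into (6, 20) walks both shapes
right-to-left, growing whichever side has the smaller running tile volume
until the volumes match:

    // input dims {3, 2} (5 * 4 = 20)  <->  output dim {1} (20)
    // input dims {1, 0} (3 * 2 = 6)   <->  output dim {0} (6)
    //
    // The checks below then require, per group, that every output dimension
    // except the earliest has strategy->dim[...] == 1, and that the earliest
    // input dimension of each group (here dim 2 with bound 4, and dim 0 with
    // bound 2) divides evenly into strategy->dim[...] chunks, so each tile is
    // a dense, contiguously copyable block.
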
- for (unsigned g = 0; g < output_groups.size(); g++) { - const std::vector& input_group = input_groups[g]; - const std::vector& output_group = output_groups[g]; - for (unsigned idx = 0; idx < (output_group.size() - 1); idx++) - assert(strategy->dim[output_group[idx]] == 1); - // the size of the earliest dimension in the input group must also - // be divisible by the number of chunks - assert( - (input->bounds[input_group.back()] % - strategy->dim[output_group.back()]) == 0); - // the output bounds also need to be evenly divisible too or this will not - // work - assert( - (output->bounds[output_group.back()] % - strategy->dim[output_group.back()]) == 0); - } - inputs.push_back(input); - outputs.push_back(output); -} - -Domain -Reshape::GetInputBounds(Processor proc) -{ - const DomainPoint local_point = strategy->find_local_point(proc); - DomainPoint lo, hi; - const int input_dims = inputs[0]->bounds.size(); - lo.dim = input_dims; - hi.dim = input_dims; - for (unsigned g = 0; g < input_groups.size(); g++) { - const std::vector& input_group = input_groups[g]; - const std::vector& output_group = output_groups[g]; - // Everything but the first dimension in the group is full size - // Remember that dimensions are in reverse order - for (unsigned idx = 0; idx < (input_group.size() - 1); idx++) { - int dim = input_group[idx]; - lo[dim] = 0; - hi[dim] = inputs[0]->bounds[dim] - 1; - } - // For the first dimension, divide it by the chunk of the - // corresponding output dimension - int input_dim = input_group.back(); - int output_dim = output_group.back(); - assert(output_dim < local_point.dim); - assert(output_dim < strategy->nDims); - size_t chunks = strategy->dim[output_dim]; - assert((inputs[0]->bounds[input_dim] % chunks) == 0); - size_t chunk = inputs[0]->bounds[input_dim] / chunks; - lo[input_dim] = local_point[output_dim] * chunk; - hi[input_dim] = lo[input_dim] + chunk - 1; - } - return Domain(lo, hi); -} - -Domain -Reshape::GetOutputBounds(Processor proc) -{ - assert(outputs[0]->bounds.size() == size_t(strategy->nDims)); - const size_t dims = outputs[0]->bounds.size(); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - return strategy->find_local_domain(proc, global); -} - -void -Reshape::Load(Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - ReshapeArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.input_bounds = GetInputBounds(proc); - proc_args.output_bounds = GetOutputBounds(proc); - proc_args.datatype = outputs[0]->type; - // volumes of the tiles should be the same - assert( - proc_args.input_bounds.get_volume() == - proc_args.output_bounds.get_volume()); - proc_args.copy_size = - proc_args.input_bounds.get_volume() * sizeof_datatype(proc_args.datatype); -} - -void -Reshape::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the 
pass-by-value arguments
-  std::map<DomainPoint, TaskArgument> values;
-  for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) {
-    const Processor proc = shardfn->find_proc(itr.p, launch_domain);
-    if (!strategy->is_local_processor(proc))
-      continue;
-    const unsigned local_index = strategy->find_local_offset(proc);
-    values[itr.p] = TaskArgument(args + local_index, sizeof(ReshapeArgs));
-  }
-  argmaps[instance_index] = runtime->construct_future_map(
-      ctx, launch_space, values, true /*collective*/, shardfn->sharding_id);
-
-  IndexTaskLauncher& launcher = launchers[instance_index];
-  launcher = IndexTaskLauncher(
-      RESHAPE_TASK_ID, launch_space, TaskArgument(NULL, 0),
-      ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED,
-      false /*must*/, mapper, strategy->tag);
-  LogicalRegion input_region = inputs[0]->region[instance_index];
-  assert(outputs.size() == 1);
-  LogicalRegion output_region = instance->create_tensor_region(outputs[0]);
-
-  // Create partitions for the regions
-  DomainTransform transform;
-  transform.m = inputs[0]->bounds.size();
-  transform.n = outputs[0]->bounds.size();
-  for (int i = 0; i < transform.m; i++)
-    for (int j = 0; j < transform.n; j++)
-      transform.matrix[i * transform.n + j] = 0;
-  DomainPoint lo, hi;
-  lo.dim = transform.m;
-  hi.dim = transform.m;
-  for (unsigned g = 0; g < input_groups.size(); g++) {
-    const std::vector<int>& input_group = input_groups[g];
-    const std::vector<int>& output_group = output_groups[g];
-    // Everything but the first dimension in the group is full size
-    // Remember that dimensions are in reverse order
-    for (unsigned idx = 0; idx < (input_group.size() - 1); idx++) {
-      int dim = input_group[idx];
-      lo[dim] = 0;
-      hi[dim] = inputs[0]->bounds[dim] - 1;
-    }
-    // For the first dimension, divide it by the chunk of the
-    // corresponding output dimension
-    int input_dim = input_group.back();
-    int output_dim = output_group.back();
-    assert(output_dim < strategy->nDims);
-    size_t chunks = strategy->dim[output_dim];
-    assert((inputs[0]->bounds[input_dim] % chunks) == 0);
-    size_t chunk = inputs[0]->bounds[input_dim] / chunks;
-    lo[input_dim] = 0;
-    hi[input_dim] = chunk - 1;
-    transform.matrix[input_dim * transform.n + output_dim] = 1;
-  }
-  Domain extent(lo, hi);
-  IndexPartition index_part = instance->find_or_create_partition(
-      input_region.get_index_space(), launch_space, transform, extent,
-      LEGION_DISJOINT_COMPLETE_KIND);
-  LogicalPartition input_part = runtime->get_logical_partition_by_tree(
-      ctx, index_part, input_region.get_field_space(),
-      input_region.get_tree_id());
-  LogicalPartition output_part =
-      instance->find_or_create_tiled_partition(outputs[0], strategy);
-  launcher.add_region_requirement(RegionRequirement(
-      input_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE,
-      input_region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(
-      output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, LEGION_EXCLUSIVE,
-      output_region));
-  launcher.add_field(1, FID_DATA);
-}
-
-void
-Reshape::forward(
-    LegionModelInstance* instance, const unsigned instance_index,
-    Runtime* runtime, Context ctx, MapperID mapper)
-{
-  runtime->execute_index_space(ctx, launchers[instance_index]);
-}
-
-void
-Reshape::finalize(
-    LegionModelInstance* instance, const unsigned instance_index,
-    Runtime* runtime, Context ctx, MapperID mapper)
-{
-  argmaps[instance_index] = FutureMap();
-}
-
-void
-Reshape::Free(Processor proc)
-{
-  // Nothing to do in this case
-}
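
The initialize() method above (and its MatMul counterpart earlier in this
diff) uses the same idiom for passing per-processor arguments to an index
launch: each local processor's args struct is wrapped in a point-wise
TaskArgument and assembled into a collective FutureMap that backs the
launcher's ArgumentMap. A schematic of the pattern, with the surrounding
declarations elided:

    // One TaskArgument per launch-domain point owned by this process;
    // collective = true lets the runtime stitch the map together across
    // shards so no rank has to gather everyone else's arguments.
    std::map<DomainPoint, TaskArgument> values;
    for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) {
      const Processor proc = shardfn->find_proc(itr.p, launch_domain);
      if (!strategy->is_local_processor(proc))
        continue;
      values[itr.p] = TaskArgument(
          args + strategy->find_local_offset(proc), sizeof(ReshapeArgs));
    }
    FutureMap argmap = runtime->construct_future_map(
        ctx, launch_space, values, true /*collective*/, shardfn->sharding_id);
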
{ - TaskVariantRegistrar cpu_registrar(RESHAPE_TASK_ID, "Reshape CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant<forward_cpu>( - cpu_registrar, "Reshape Operator"); - } -#ifdef LEGION_USE_CUDA - { - TaskVariantRegistrar gpu_registrar(RESHAPE_TASK_ID, "Reshape GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant<forward_gpu>( - gpu_registrar, "Reshape Operator"); - } -#endif -} - -/*static*/ void -Reshape::forward_cpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(ReshapeArgs)); - const ReshapeArgs* args = (const ReshapeArgs*)task->local_args; - assert(regions.size() == 2); - assert(task->regions.size() == 2); - const void* input_ptr = nullptr; - void* output_ptr = nullptr; - size_t volume = 0; - switch (args->input_bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect<DIM> bounds = args->input_bounds; \ - volume = bounds.volume(); \ - input_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - switch (args->output_bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect<DIM> bounds = args->output_bounds; \ - output_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[1]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - memcpy(output_ptr, input_ptr, args->copy_size); -} - -ReshapeArgs::ReshapeArgs(void) {} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Reshape::forward_gpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(ReshapeArgs)); - const ReshapeArgs* args = (const ReshapeArgs*)task->local_args; -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - assert(regions.size() == 2); - assert(task->regions.size() == 2); - const void* input_ptr = nullptr; - void* output_ptr = nullptr; - size_t volume = 0; - switch (args->input_bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect<DIM> bounds = args->input_bounds; \ - volume = bounds.volume(); \ - input_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - switch (args->output_bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect<DIM> bounds = args->output_bounds; \ - output_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[1]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaMemcpyAsync( - output_ptr, input_ptr, args->copy_size, cudaMemcpyDeviceToDevice)); -#else - CHECK_CUDA(cudaMemcpyAsync( - output_ptr, input_ptr, args->copy_size, cudaMemcpyDeviceToDevice, - stream)); -#endif - if (args->profiling) { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_end)); -#else - CHECK_CUDA(cudaEventRecord(t_end, stream)); -#endif - CHECK_CUDA(cudaEventSynchronize(t_end)); - float 
elapsed = 0; - CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - CHECK_CUDA(cudaEventDestroy(t_start)); - CHECK_CUDA(cudaEventDestroy(t_end)); - printf( - "%s [Reshape] forward time (CF) = %.2fms\n", - args->owner->op_name.c_str(), elapsed); - } -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/reshape.h b/triton/src/operators/reshape.h deleted file mode 100644 index 79669ca520..0000000000 --- a/triton/src/operators/reshape.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_RESHAPE_H__ -#define __LEGION_TRITON_RESHAPE_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -struct ReshapeArgs : public OperatorArgs { - public: - ReshapeArgs(void); - Legion::Domain input_bounds; - Legion::Domain output_bounds; - DataType datatype; - size_t copy_size; -}; - -class Reshape : public Operator { - public: - Reshape( - LegionModelState* model, const LayerStrategy* strategy, const char* name); - - void Configure(Tensor* input, Tensor* output); - Legion::Domain GetInputBounds(Realm::Processor proc); - Legion::Domain GetOutputBounds(Realm::Processor proc); - - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - static void PreregisterTaskVariants(void); - - static void forward_cpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - -#ifdef LEGION_USE_CUDA - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#endif - protected: - std::vector > input_groups; - std::vector > output_groups; - ReshapeArgs args[MAX_LOCAL_PROCS]; - Legion::FutureMap argmaps[MAX_NUM_INSTANCES]; - Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES]; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_RESHAPE_H__ diff --git a/triton/src/operators/softmax.cc b/triton/src/operators/softmax.cc deleted file mode 100644 index 5254a97649..0000000000 --- a/triton/src/operators/softmax.cc +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "softmax.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Softmax::Softmax( - LegionModelState* model, const LayerStrategy* strategy, unsigned dim, - const char* name) - : Operator(model, strategy, OperatorType::OP_SOFTMAX, name, 1, 0, 1), - dim(dim) -{ -} - -void -Softmax::Configure(Tensor* input, Tensor* output) -{ - assert(input != nullptr); - assert(output != nullptr); - assert(input->type == output->type); - // Make sure that they have the same bounds - assert(input->bounds.size() == output->bounds.size()); - for (unsigned idx = 0; idx < input->bounds.size(); idx++) - assert(input->bounds[idx] == output->bounds[idx]); - inputs.push_back(input); - outputs.push_back(output); -} - -Domain -Softmax::GetBounds(Processor proc) -{ - const size_t dims = outputs[0]->bounds.size(); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - return strategy->find_local_domain(proc, global); -} - -void -Softmax::Load(Processor proc) -{ - assert(proc.kind() == strategy->kind); - assert(inputs[0]->bounds.size() == size_t(strategy->nDims)); - // Make sure that we don't have any partitions along the dimension - // on which we are going to perform the softmax computation - assert(strategy->dim[dim] == 1); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - SoftmaxArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.local_index = local_index; - proc_args.bounds = GetBounds(proc); - proc_args.datatype = inputs[0]->type; - proc_args.dim = dim; -#ifdef LEGION_USE_CUDA - if (proc.kind() == Processor::TOC_PROC) { - proc_args.cudnn = model->runtime_->cudnn[local_index]; - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.inputTensor)); - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.outputTensor)); - Domain cudnn_bounds; - cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; - // Need to figure out how to pad the bounds to make sure the - // softmax is done over the right dimension - switch (proc_args.bounds.get_dim()) { - case 1: { - assert(dim == 0); - DomainPoint lo, hi; - lo.dim = 4; - hi.dim = 4; - for (int d = 0; d < 4; d++) { - if (d == 1) { - lo[d] = proc_args.bounds.lo()[0]; - hi[d] = proc_args.bounds.hi()[0]; - } else { - lo[d] = 0; - hi[d] = 0; - } - } - cudnn_bounds = Domain(lo, hi); - break; - } - case 2: { - DomainPoint lo, hi; - lo.dim = 4; - hi.dim = 4; - if (dim == 0) { - lo[0] = 0; - hi[0] = 0; - for (int d = 1; d <= 2; d++) { - lo[d] = proc_args.bounds.lo()[d - 1]; - hi[d] = proc_args.bounds.hi()[d - 1]; - } - lo[3] = 0; - hi[3] = 0; - } else { - assert(dim == 1); - format = CUDNN_TENSOR_NHWC; - for (int d = 0; d < 2; d++) { - lo[d] = 0; - hi[d] = 0; - } - for (int d = 2; d < 4; d++) { - lo[d] = proc_args.bounds.lo()[d - 2]; - hi[d] = proc_args.bounds.hi()[d - 2]; - } - } - cudnn_bounds = Domain(lo, hi); - break; - } - case 3: { - 
DomainPoint lo, hi; - lo.dim = 4; - hi.dim = 4; - if (dim < 2) { - if (dim == 0) { - lo[0] = 0; - hi[0] = 0; - } else { - lo[3] = 0; - hi[3] = 0; - } - for (int d = 1; d <= 3; d++) { - lo[d - dim] = proc_args.bounds.lo()[d - 1]; - hi[d - dim] = proc_args.bounds.hi()[d - 1]; - } - } else { - assert(dim == 2); - format = CUDNN_TENSOR_NHWC; - lo[0] = 0; - hi[0] = 0; - for (int d = 1; d < 4; d++) { - lo[d] = proc_args.bounds.lo()[d - 1]; - hi[d] = proc_args.bounds.hi()[d - 1]; - } - } - cudnn_bounds = Domain(lo, hi); - break; - } - case 4: { - if (dim == 0) { - // cudnn claims to support this type, but apparently not - // format = CUDNN_TENSOR_CHWN; - // cudnn_bounds = proc_args.bounds; - fprintf(stderr, "Unsupported cudnn softmax format"); - abort(); - } else if (dim == 1) { - format = CUDNN_TENSOR_NCHW; - cudnn_bounds = proc_args.bounds; - } else if (dim == 2) { - // There's no way to do this with cudnn even with 5-d tensors - // given the kinds of locations for the channel dimension that - // cudnn is willing to support - fprintf(stderr, "Unsupported cudnn softmax format"); - abort(); - } else { - assert(dim == 3); - format = CUDNN_TENSOR_NHWC; - cudnn_bounds = proc_args.bounds; - } - break; - } - default: - assert(false); - } - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.inputTensor, cudnn_bounds, inputs[0]->type, format)); - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.outputTensor, cudnn_bounds, outputs[0]->type, format)); - } -#endif -} - -void -Softmax::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the pass-by-value arguments - std::map<DomainPoint, TaskArgument> values; - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); - values[itr.p] = TaskArgument(args + local_index, sizeof(SoftmaxArgs)); - } - argmaps[instance_index] = runtime->construct_future_map( - ctx, launch_space, values, true /*collective*/, shardfn->sharding_id); - - IndexTaskLauncher& launcher = launchers[instance_index]; - launcher = IndexTaskLauncher( - SOFTMAX_TASK_ID, launch_space, TaskArgument(NULL, 0), - ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED, - false /*must*/, mapper, strategy->tag); - LogicalRegion input_region = inputs[0]->region[instance_index]; - assert(outputs.size() == 1); - LogicalRegion output_region = instance->create_tensor_region(outputs[0]); - - // Create partitions for the regions - LogicalPartition input_part = - instance->find_or_create_tiled_partition(inputs[0], strategy); - LogicalPartition output_part = - instance->find_or_create_tiled_partition(outputs[0], strategy); - launcher.add_region_requirement(RegionRequirement( - input_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input_region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, LEGION_EXCLUSIVE, - output_region)); - launcher.add_field(1, FID_DATA); -} - -void -Softmax::forward( - 
LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - runtime->execute_index_space(ctx, launchers[instance_index]); -} - -void -Softmax::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - argmaps[instance_index] = FutureMap(); -} - -void -Softmax::Free(Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; -#ifdef LEGION_USE_CUDA - if (proc.kind() == Processor::TOC_PROC) { - const unsigned local_index = strategy->find_local_offset(proc); - SoftmaxArgs& proc_args = args[local_index]; - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.inputTensor)); - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.outputTensor)); - } -#endif -} - -/*static*/ void -Softmax::PreregisterTaskVariants(void) -{ - { - TaskVariantRegistrar cpu_registrar(SOFTMAX_TASK_ID, "Softmax CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant<forward_cpu>( - cpu_registrar, "Softmax Operator"); - } -#ifdef LEGION_USE_CUDA - { - TaskVariantRegistrar gpu_registrar(SOFTMAX_TASK_ID, "Softmax GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant<forward_gpu>( - gpu_registrar, "Softmax Operator"); - } -#endif -} - -/*static*/ void -Softmax::forward_cpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - // TODO: implement this - assert(false); -} - -SoftmaxArgs::SoftmaxArgs(void) {} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Softmax::forward_gpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - assert(task->local_arglen == sizeof(SoftmaxArgs)); - const SoftmaxArgs* args = (const SoftmaxArgs*)task->local_args; -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - CHECK_CUDNN(cudnnSetStream(args->cudnn, stream)); -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - assert(regions.size() == 2); - assert(task->regions.size() == 2); - const void* input_ptr = nullptr; - void* output_ptr = nullptr; - size_t volume = 0; - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect<DIM> bounds = args->bounds; \ - volume = bounds.volume(); \ - input_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - output_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[1]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - float alpha = 1.f, beta = 0.f; - // TODO: can we get away with CUDNN_SOFTMAX_FAST for inference? 
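
A plausible resolution for the TODO above (an assumption about cuDNN behavior, not something this code establishes): CUDNN_SOFTMAX_FAST skips the max-subtraction pass that CUDNN_SOFTMAX_ACCURATE performs to guard against floating-point overflow, so for inference it is generally safe for well-scaled FP32 activations but risky for FP16. A hedged selection, reusing this file's args->datatype field, might look like:

// Hypothetical: keep the overflow guard for half precision only.
cudnnSoftmaxAlgorithm_t algo = (args->datatype == DT_HALF)
    ? CUDNN_SOFTMAX_ACCURATE
    : CUDNN_SOFTMAX_FAST;

with algo then passed as the second argument to the cudnnSoftmaxForward call below.
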
- CHECK_CUDNN(cudnnSoftmaxForward( - args->cudnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, - args->inputTensor, input_ptr, &beta, args->outputTensor, output_ptr)); - if (args->profiling) { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_end)); -#else - CHECK_CUDA(cudaEventRecord(t_end, stream)); -#endif - CHECK_CUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - CHECK_CUDA(cudaEventDestroy(t_start)); - CHECK_CUDA(cudaEventDestroy(t_end)); - printf( - "%s [Softmax] forward time (CF) = %.2fms\n", - args->owner->op_name.c_str(), elapsed); - } -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/softmax.h deleted file mode 100644 index 2748ecc1f0..0000000000 --- a/triton/src/operators/softmax.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_SOFTMAX_H__ -#define __LEGION_TRITON_SOFTMAX_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -struct SoftmaxArgs : public OperatorArgs { - public: - SoftmaxArgs(void); -#ifdef LEGION_USE_CUDA - cudnnHandle_t cudnn; - cudnnTensorDescriptor_t inputTensor, outputTensor; -#endif - unsigned local_index; - Legion::Domain bounds; - DataType datatype; - unsigned dim; -}; - -class Softmax : public Operator { - public: - Softmax( - LegionModelState* model, const LayerStrategy* strategy, unsigned dim, - const char* name); - - void Configure(Tensor* input, Tensor* output); - Legion::Domain GetBounds(Realm::Processor proc); - - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - static void PreregisterTaskVariants(void); - - static void forward_cpu( - const Legion::Task* task, - const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, - Legion::Runtime* runtime); - -#ifdef LEGION_USE_CUDA - static void forward_gpu( - const Legion::Task* task, - const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#endif - public: - const int dim; - - protected: - SoftmaxArgs args[MAX_LOCAL_PROCS]; - Legion::FutureMap argmaps[MAX_NUM_INSTANCES]; - Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES]; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_SOFTMAX_H__ diff --git a/triton/src/operators/unary.cc 
b/triton/src/operators/unary.cc deleted file mode 100644 index c0cdf8a371..0000000000 --- a/triton/src/operators/unary.cc +++ /dev/null @@ -1,386 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "unary.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -UnaryArgs::UnaryArgs(void) {} - -UnaryOperator::UnaryOperator( - LegionModelState* model, const LayerStrategy* strategy, OperatorType type, - const void* scalar_value, DataType stype, bool inplace, const char* name) - : Operator(model, strategy, type, name, 1, 0, 1), scalar_type(stype), - inplace(inplace) -{ - switch (stype) { - case DT_NONE: - break; - case DT_INT8: { - memcpy(&scalar.int8_value, scalar_value, sizeof(uint8_t)); - break; - } - case DT_HALF: { - memcpy(&scalar.half_value, scalar_value, sizeof(__half)); - break; - } - case DT_FLOAT: { - memcpy(&scalar.float_value, scalar_value, sizeof(float)); - break; - } - case DT_DOUBLE: { - memcpy(&scalar.double_value, scalar_value, sizeof(double)); - break; - } - default: - abort(); - } -} - -UnaryOperator::~UnaryOperator(void) {} - -void -UnaryOperator::Configure(Tensor* input, Tensor* output) -{ - assert(input != nullptr); - assert(output != nullptr); - assert(input->type == scalar_type); - assert((op_type == OP_CAST) || (input->type == output->type)); - assert(!inplace || (input == output)); - // Make sure that they have the same bounds - assert(input->bounds.size() == output->bounds.size()); - for (unsigned idx = 0; idx < input->bounds.size(); idx++) - assert(input->bounds[idx] == output->bounds[idx]); - inputs.push_back(input); - outputs.push_back(output); -} - -Domain -UnaryOperator::GetBounds(Processor proc) -{ - const size_t dims = outputs[0]->bounds.size(); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - lo[d] = 0; - hi[d] = outputs[0]->bounds[d] - 1; - } - const Domain global(lo, hi); - return strategy->find_local_domain(proc, global); -} - -void -UnaryOperator::Load(Realm::Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; - const unsigned local_index = strategy->find_local_offset(proc); - UnaryArgs& proc_args = args[local_index]; - proc_args.owner = this; - proc_args.local_index = local_index; - proc_args.op_type = op_type; - proc_args.bounds = GetBounds(proc); - proc_args.datatype = scalar_type; - proc_args.casttype = outputs[0]->type; - proc_args.inplace = inplace; - switch (scalar_type) { - case DT_NONE: - break; - case DT_INT8: { - proc_args.scalar.int8_value = scalar.int8_value; - break; - } - case DT_HALF: { - proc_args.scalar.half_value = scalar.half_value; - break; - } - case DT_FLOAT: { - proc_args.scalar.float_value = scalar.float_value; - break; - } - case DT_DOUBLE: { - proc_args.scalar.double_value = scalar.double_value; - break; - } - default: - abort(); - } -#ifdef 
LEGION_USE_CUDA - if (proc.kind() == Processor::TOC_PROC) { - if (use_cudnn(op_type)) { - proc_args.cudnn = model->runtime_->cudnn[local_index]; - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.inputTensor)); - CHECK_CUDNN(cudnnCreateTensorDescriptor(&proc_args.outputTensor)); - CHECK_CUDNN(cudnnCreateActivationDescriptor(&proc_args.actiDesc)); - cudnnActivationMode_t mode; - switch (op_type) { - case OP_SIGMOID: { - mode = CUDNN_ACTIVATION_SIGMOID; - break; - } - case OP_RELU: { - mode = CUDNN_ACTIVATION_RELU; - break; - } - case OP_TANH: { - mode = CUDNN_ACTIVATION_TANH; - break; - } - case OP_ELU: { - mode = CUDNN_ACTIVATION_ELU; - break; - } - default: - abort(); - } - CHECK_CUDNN(cudnnSetActivationDescriptor( - proc_args.actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.inputTensor, proc_args.bounds, inputs[0]->type)); - CHECK_CUDNN(cudnnSetTensorDescriptorFromDomain( - proc_args.outputTensor, proc_args.bounds, outputs[0]->type)); - } - } -#endif -} - -void -UnaryOperator::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - const Domain launch_domain = strategy->get_launch_domain(); - // Find or create the launch space domain - IndexSpace launch_space = instance->find_or_create_index_space(launch_domain); - // Also get the sharding function from the strategy - ShardingFunction* shardfn = strategy->sharding_function; - // Construct a future map for the pass-by-value arguments - std::map values; - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = shardfn->find_proc(itr.p, launch_domain); - if (!strategy->is_local_processor(proc)) - continue; - const unsigned local_index = strategy->find_local_offset(proc); - values[itr.p] = TaskArgument(args + local_index, sizeof(UnaryArgs)); - } - argmaps[instance_index] = runtime->construct_future_map( - ctx, launch_space, values, true /*collective*/, shardfn->sharding_id); - - IndexTaskLauncher& launcher = launchers[instance_index]; - launcher = IndexTaskLauncher( - UNARY_TASK_ID, launch_space, TaskArgument(NULL, 0), - ArgumentMap(argmaps[instance_index]), Predicate::TRUE_PRED, - false /*must*/, mapper, strategy->tag); - LogicalRegion input_region = inputs[0]->region[instance_index]; - if (inplace) { - LogicalPartition part = - instance->find_or_create_tiled_partition(inputs[0], strategy); - launcher.add_region_requirement(RegionRequirement( - part, 0 /*projection id*/, LEGION_READ_WRITE, LEGION_EXCLUSIVE, - input_region)); - launcher.add_field(0, FID_DATA); - } else { - // Create a logical region for the output data - assert(outputs.size() == 1); - LogicalRegion output_region = instance->create_tensor_region(outputs[0]); - - // Create partitions for the regions - LogicalPartition input_part = - instance->find_or_create_tiled_partition(inputs[0], strategy); - LogicalPartition output_part = - instance->find_or_create_tiled_partition(outputs[0], strategy); - launcher.add_region_requirement(RegionRequirement( - input_part, 0 /*projection id*/, LEGION_READ_ONLY, LEGION_EXCLUSIVE, - input_region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - output_part, 0 /*projection id*/, LEGION_WRITE_DISCARD, - LEGION_EXCLUSIVE, output_region)); - launcher.add_field(1, FID_DATA); - } -} - -void -UnaryOperator::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, 
Legion::Context ctx, Legion::MapperID mapper) -{ - runtime->execute_index_space(ctx, launchers[instance_index]); -} - -void -UnaryOperator::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - argmaps[instance_index] = FutureMap(); -} - -void -UnaryOperator::Free(Realm::Processor proc) -{ - assert(proc.kind() == strategy->kind); - // If this processor is not used for this layer there is nothing to do - if (!strategy->is_local_processor(proc)) - return; -#ifdef LEGION_USE_CUDA - if ((proc.kind() == Processor::TOC_PROC) && use_cudnn(op_type)) { - const unsigned local_index = strategy->find_local_offset(proc); - UnaryArgs& proc_args = args[local_index]; - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.inputTensor)); - CHECK_CUDNN(cudnnDestroyTensorDescriptor(proc_args.outputTensor)); - CHECK_CUDNN(cudnnDestroyActivationDescriptor(proc_args.actiDesc)); - } -#endif -} - -/*static*/ void -UnaryOperator::PreregisterTaskVariants(void) -{ - { - TaskVariantRegistrar cpu_registrar(UNARY_TASK_ID, "Unary CPU"); - cpu_registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - cpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - cpu_registrar, "Unary Operator"); - } -#ifdef LEGION_USE_CUDA - { - TaskVariantRegistrar gpu_registrar(UNARY_TASK_ID, "Unary GPU"); - gpu_registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - gpu_registrar.set_leaf(); - Runtime::preregister_task_variant( - gpu_registrar, "Unary Operator"); - } -#endif -} - -/*static*/ void -UnaryOperator::forward_cpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - // TODO: implement this - assert(false); -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -UnaryOperator::forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime) -{ - assert(task->local_arglen == sizeof(UnaryArgs)); - const UnaryArgs* args = (const UnaryArgs*)task->local_args; -#ifndef DISABLE_LEGION_CUDA_HIJACK - ::cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - if (use_cudnn(args->op_type)) { - CHECK_CUDNN(cudnnSetStream(args->cudnn, stream)); - } -#endif - ::cudaEvent_t t_start, t_end; - if (args->profiling) { - CHECK_CUDA(cudaEventCreate(&t_start)); - CHECK_CUDA(cudaEventCreate(&t_end)); -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_start)); -#else - CHECK_CUDA(cudaEventRecord(t_start, stream)); -#endif - } - if (args->inplace) { - assert(regions.size() == 1); - assert(task->regions.size() == 1); - void* inout_ptr = nullptr; - size_t volume = 0; - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->bounds; \ - volume = bounds.volume(); \ - inout_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - forward_kernel(args, stream, inout_ptr, inout_ptr, volume); - } else { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - const void* input_ptr = nullptr; - void* output_ptr = nullptr; - size_t volume = 0; - switch (args->bounds.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - const Rect bounds = args->bounds; \ - volume = bounds.volume(); \ - input_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[0]); \ - output_ptr = TensorAccessor::access( \ - args->datatype, bounds, regions[1]); \ - break; \ - } - 
LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - forward_kernel(args, stream, input_ptr, output_ptr, volume); - } - if (args->profiling) { -#ifdef DISABLE_LEGION_CUDA_HIJACK - CHECK_CUDA(cudaEventRecord(t_end)); -#else - CHECK_CUDA(cudaEventRecord(t_end, stream)); -#endif - CHECK_CUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - CHECK_CUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - CHECK_CUDA(cudaEventDestroy(t_start)); - CHECK_CUDA(cudaEventDestroy(t_end)); - printf( - "%s [Unary] forward time (CF) = %.2fms\n", args->owner->op_name.c_str(), - elapsed); - } -} - -/*static*/ bool -UnaryOperator::use_cudnn(OperatorType optype) -{ - if (optype == OP_RELU) - return true; - if (optype == OP_SIGMOID) - return true; - if (optype == OP_TANH) - return true; - if (optype == OP_ELU) - return true; - return false; -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/unary.cu deleted file mode 100644 index ef4c77f8e4..0000000000 --- a/triton/src/operators/unary.cu +++ /dev/null @@ -1,462 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "unary.h" - -#include "mathtypes/half.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -template <typename TI, typename TO> -__global__ static void -gpu_forward_cast(const TI* input, TO* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - output[offset] = (TO)input[offset]; -} - -// Some unfortunate specializations because the compiler can't figure -// out the best intermediate type to convert half types to -template <> -__global__ void -gpu_forward_cast<__half, int8_t>( - const __half* input, int8_t* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - output[offset] = (short)input[offset]; -} - -template <> -__global__ void -gpu_forward_cast<__half, int64_t>( - const __half* input, int64_t* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - output[offset] = (long long)input[offset]; -} - -template <> -__global__ void -gpu_forward_cast<int64_t, __half>( - const int64_t* input, __half* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - output[offset] = (long long)input[offset]; -} - -template <> -__global__ void -gpu_forward_cast<__half, uint8_t>( - const __half* input, uint8_t* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - output[offset] = (unsigned short)input[offset]; -} - -template <> -__global__ void -gpu_forward_cast<__half, uint64_t>( - const __half* input, uint64_t* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + 
threadIdx.x; - if (offset >= volume) - return; - output[offset] = (unsigned long long)input[offset]; -} - -template <> -__global__ void -gpu_forward_cast( - const uint64_t* input, __half* output, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - output[offset] = (unsigned long long)input[offset]; -} - -__global__ static void -unary_forward_half( - const __half* input, __half* output, const __half alpha, const __half beta, - const __half scalar, const OperatorType optype, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - switch (optype) { - case OP_EXP: { - output[offset] = alpha * hexp(input[offset]) + beta * output[offset]; - break; - } - case OP_LOG: { - output[offset] = alpha * hlog(input[offset]) + beta * output[offset]; - break; - } - case OP_SQRT: { - output[offset] = alpha * hsqrt(input[offset]) + beta * output[offset]; - break; - } - case OP_IDENTITY: { - output[offset] = input[offset]; - break; - } - case OP_SCALAR_MULTIPLY: { - output[offset] = input[offset] * scalar; - break; - } - case OP_SCALAR_ADD: { - output[offset] = input[offset] + scalar; - break; - } - case OP_SCALAR_SUB: { - output[offset] = input[offset] - scalar; - break; - } - case OP_SCALAR_TRUE_DIV: { - output[offset] = input[offset] / scalar; - break; - } - case OP_GELU: { - output[offset] = __hmul( - __hmul(input[offset], 0.5f), - erfcf(__hmul(-input[offset], M_SQRT1_2))); - break; - } - case OP_RECIPROCAL: { - output[offset] = __hdiv(__half(1.f), input[offset]); - break; - } - default: - break; - } -} - -__global__ static void -unary_forward_float( - const float* input, float* output, const float alpha, const float beta, - const float scalar, const OperatorType optype, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - switch (optype) { - case OP_EXP: { - output[offset] = alpha * expf(input[offset]) + beta * output[offset]; - break; - } - case OP_LOG: { - output[offset] = alpha * logf(input[offset]) + beta * output[offset]; - break; - } - case OP_SQRT: { - output[offset] = alpha * sqrtf(input[offset]) + beta * output[offset]; - break; - } - case OP_IDENTITY: { - output[offset] = input[offset]; - break; - } - case OP_SCALAR_MULTIPLY: { - output[offset] = input[offset] * scalar; - break; - } - case OP_SCALAR_ADD: { - output[offset] = input[offset] + scalar; - break; - } - case OP_SCALAR_SUB: { - output[offset] = input[offset] - scalar; - break; - } - case OP_SCALAR_TRUE_DIV: { - output[offset] = input[offset] / scalar; - break; - } - case OP_GELU: { - output[offset] = input[offset] * 0.5f * erfc(-input[offset] * M_SQRT1_2); - break; - } - case OP_RECIPROCAL: { - output[offset] = 1.f / input[offset]; - break; - } - default: - break; - } -} - -__global__ static void -unary_forward_double( - const double* input, double* output, const double alpha, const double beta, - const double scalar, const OperatorType optype, const size_t volume) -{ - const size_t offset = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - if (offset >= volume) - return; - switch (optype) { - case OP_EXP: { - output[offset] = alpha * exp(input[offset]) + beta * output[offset]; - break; - } - case OP_LOG: { - output[offset] = alpha * log(input[offset]) + beta * output[offset]; - break; - } - case OP_SQRT: { - output[offset] = alpha * sqrt(input[offset]) + beta * output[offset]; - break; - } - case OP_IDENTITY: { - 
output[offset] = input[offset]; - break; - } - case OP_SCALAR_MULTIPLY: { - output[offset] = input[offset] * scalar; - break; - } - case OP_SCALAR_ADD: { - output[offset] = input[offset] + scalar; - break; - } - case OP_SCALAR_SUB: { - output[offset] = input[offset] - scalar; - break; - } - case OP_SCALAR_TRUE_DIV: { - output[offset] = input[offset] / scalar; - break; - } - case OP_GELU: { - output[offset] = input[offset] * 0.5 * erfc(-input[offset] * M_SQRT1_2); - break; - } - case OP_RECIPROCAL: { - output[offset] = 1.0 / input[offset]; - break; - } - default: - break; - } -} - -template <typename T> -__host__ static void -forward_cast( - DataType output_type, ::cudaStream_t stream, const void* input_ptr, - void* output_ptr, size_t num_elements) -{ - const size_t blocks = - (num_elements + (THREADS_PER_BLOCK - 1)) / THREADS_PER_BLOCK; - switch (output_type) { - case DT_HALF: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (__half*)output_ptr, num_elements); - break; - } - case DT_FLOAT: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (float*)output_ptr, num_elements); - break; - } - case DT_DOUBLE: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (double*)output_ptr, num_elements); - break; - } - case DT_INT8: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (int8_t*)output_ptr, num_elements); - break; - } - case DT_INT16: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (int16_t*)output_ptr, num_elements); - break; - } - case DT_INT32: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (int32_t*)output_ptr, num_elements); - break; - } - case DT_INT64: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (int64_t*)output_ptr, num_elements); - break; - } - case DT_UINT8: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (uint8_t*)output_ptr, num_elements); - break; - } - case DT_UINT16: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (uint16_t*)output_ptr, num_elements); - break; - } - case DT_UINT32: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (uint32_t*)output_ptr, num_elements); - break; - } - case DT_UINT64: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (uint64_t*)output_ptr, num_elements); - break; - } - case DT_BOOLEAN: { - gpu_forward_cast<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const T*)input_ptr, (bool*)output_ptr, num_elements); - break; - } - default: - abort(); - } -} - -__host__ - /*static*/ void - UnaryOperator::forward_kernel( - const UnaryArgs* args, ::cudaStream_t stream, const void* input_ptr, - void* output_ptr, size_t num_elements) -{ - if (args->op_type == OP_CAST) { - switch (args->datatype) { - case DT_HALF: { - forward_cast<__half>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_FLOAT: { - forward_cast<float>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_DOUBLE: { - forward_cast<double>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_INT8: { - forward_cast<int8_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_INT16: { - forward_cast<int16_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_INT32: { - forward_cast<int32_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_INT64: { - forward_cast<int64_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_UINT8: { - forward_cast<uint8_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_UINT16: { - forward_cast<uint16_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_UINT32: 
{ - forward_cast<uint32_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_UINT64: { - forward_cast<uint64_t>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - case DT_BOOLEAN: { - forward_cast<bool>( - args->casttype, stream, input_ptr, output_ptr, num_elements); - break; - } - default: - abort(); - } - } else if (use_cudnn(args->op_type)) { - if (args->datatype == DT_DOUBLE) { - double alpha = 1.0, beta = 0.0; - CHECK_CUDNN(cudnnActivationForward( - args->cudnn, args->actiDesc, &alpha, args->inputTensor, input_ptr, - &beta, args->outputTensor, output_ptr)); - } else { - float alpha = 1.f, beta = 0.f; - CHECK_CUDNN(cudnnActivationForward( - args->cudnn, args->actiDesc, &alpha, args->inputTensor, input_ptr, - &beta, args->outputTensor, output_ptr)); - } - } else { - const size_t blocks = - (num_elements + (THREADS_PER_BLOCK - 1)) / THREADS_PER_BLOCK; - assert( - (args->op_type == OP_EXP) || (args->op_type == OP_LOG) || - (args->op_type == OP_SQRT) || (args->op_type == OP_IDENTITY) || - (args->op_type == OP_SCALAR_MULTIPLY) || - (args->op_type == OP_SCALAR_ADD) || (args->op_type == OP_SCALAR_SUB) || - (args->op_type == OP_SCALAR_TRUE_DIV) || (args->op_type == OP_GELU) || - (args->op_type == OP_RECIPROCAL)); - switch (args->datatype) { - case DT_HALF: { - __half alpha = 1.f, beta = 0.f; - unary_forward_half<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const __half*)input_ptr, (__half*)output_ptr, alpha, beta, - args->scalar.half_value, args->op_type, num_elements); - break; - } - case DT_FLOAT: { - float alpha = 1.f, beta = 0.f; - unary_forward_float<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const float*)input_ptr, (float*)output_ptr, alpha, beta, - args->scalar.float_value, args->op_type, num_elements); - break; - } - case DT_DOUBLE: { - double alpha = 1.0, beta = 0.0; - unary_forward_double<<<blocks, THREADS_PER_BLOCK, 0, stream>>>( - (const double*)input_ptr, (double*)output_ptr, alpha, beta, - args->scalar.double_value, args->op_type, num_elements); - break; - } - default: - // TODO support for other data types like int8 - abort(); - } - } -} - -}}} // namespace triton::backend::legion diff --git a/triton/src/operators/unary.h deleted file mode 100644 index 33723d90a4..0000000000 --- a/triton/src/operators/unary.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __LEGION_TRITON_UNARY_H__ -#define __LEGION_TRITON_UNARY_H__ - -#include "operator.h" -#include "tensor.h" - -namespace triton { namespace backend { namespace legion { - -class UnaryOperator; - -struct UnaryArgs : public OperatorArgs { - public: - UnaryArgs(void); -#ifdef LEGION_USE_CUDA - cudnnHandle_t cudnn; - cudnnTensorDescriptor_t inputTensor, outputTensor; - cudnnActivationDescriptor_t actiDesc; -#endif - unsigned local_index; - OperatorType op_type; - Legion::Domain bounds; - DataType datatype; - DataType casttype; - union { - int8_t int8_value; - __half half_value; - float float_value; - double double_value; - } scalar; - bool inplace; -}; - -class UnaryOperator : public Operator { - public: - UnaryOperator( - LegionModelState* model, const LayerStrategy* strategy, OperatorType type, - const void* scalar, DataType scalar_type, bool inplace, const char* name); - virtual ~UnaryOperator(void); - - void Configure(Tensor* input, Tensor* output); - Legion::Domain GetBounds(Realm::Processor proc); - - virtual void Load(Realm::Processor processor) override; - virtual void initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, - Legion::MapperID mapper) override; - virtual void Free(Realm::Processor processor) override; - - public: - static void PreregisterTaskVariants(void); - static void forward_cpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); -#ifdef LEGION_USE_CUDA - public: - static void forward_gpu( - const Legion::Task* task, - const std::vector& regions, Legion::Context ctx, - Legion::Runtime* runtime); - - protected: - static bool use_cudnn(OperatorType optype); - static void forward_kernel( - const UnaryArgs* args, ::cudaStream_t stream, const void* input_ptr, - void* output_ptr, size_t num_elements); -#endif - public: - const DataType scalar_type; - union { - int8_t int8_value; - __half half_value; - float float_value; - double double_value; - } scalar; - const bool inplace; - - protected: - UnaryArgs args[MAX_LOCAL_PROCS]; - Legion::FutureMap argmaps[MAX_NUM_INSTANCES]; - Legion::IndexTaskLauncher launchers[MAX_NUM_INSTANCES]; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_UNARY_H__ diff --git a/triton/src/runtime.cc b/triton/src/runtime.cc deleted file mode 100644 index 91025cf2d8..0000000000 --- a/triton/src/runtime.cc +++ /dev/null @@ -1,799 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "runtime.h" -#include "instance.h" -#include "legion/legion_utilities.h" -#include "model.h" -#include "operator.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -TRITONSERVER_Error* -LegionTritonRuntime::Create(Legion::TaskID ttid, LegionTritonRuntime** runtime) -{ - Machine machine = Machine::get_machine(); - // Find our local utility processor first - Processor local_proc; - std::vector<Processor> local_cpus, local_gpus; - { - Machine::ProcessorQuery query(machine); - query.only_kind(Processor::LOC_PROC /*CPU*/); - query.local_address_space(); - size_t count = query.count(); - if (count == 0) - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNKNOWN, - "Unable to find any Realm CPU processors"); - if (count > 1) { - local_cpus.reserve(count); - for (Machine::ProcessorQuery::iterator it = query.begin(); - it != query.end(); it++) - local_cpus.push_back(*it); - local_proc = ProcessorGroup::create_group(local_cpus); - } else { - local_proc = query.first(); - local_cpus.push_back(local_proc); - } - } - const AddressSpaceID local_space = local_proc.address_space(); - // Find a remote utility processor to use as well for each node - std::vector<Processor> remote_procs, all_cpus, all_gpus; - { - Machine::ProcessorQuery query(machine); - query.only_kind(Processor::LOC_PROC /*CPU*/); - std::map<AddressSpaceID, Processor> unique_spaces; - all_cpus.reserve(query.count()); - for (Machine::ProcessorQuery::iterator it = query.begin(); - it != query.end(); it++) { - all_cpus.push_back(*it); - AddressSpaceID space = it->address_space(); - if (space == local_space) - continue; - if (unique_spaces.find(space) != unique_spaces.end()) - continue; - unique_spaces[space] = *it; - } - // plus one because we did not include ourselves - remote_procs.resize(unique_spaces.size() + 1, Processor::NO_PROC); - for (auto it = unique_spaces.begin(); it != unique_spaces.end(); it++) - remote_procs[it->first] = it->second; - } - { - Machine::ProcessorQuery query(machine); - query.only_kind(Processor::TOC_PROC /*GPU*/); - all_gpus.reserve(query.count()); - for (Machine::ProcessorQuery::iterator it = query.begin(); - it != query.end(); it++) { - all_gpus.push_back(*it); - if (it->address_space() == local_space) - local_gpus.push_back(*it); - } - } - *runtime = new LegionTritonRuntime( - Runtime::get_runtime(), ttid, local_space, remote_procs.size(), - local_proc, std::move(remote_procs), std::move(all_cpus), - std::move(all_gpus), local_gpus); - // Register our tasks with Realm for handling messages - std::vector<Realm::Event> ready_events; - Realm::ProfilingRequestSet no_requests; - CodeDescriptor message_desc(LegionTritonRuntime::InstanceMessageTask); - CodeDescriptor context_desc(LegionTritonRuntime::CreateContextTask); - CodeDescriptor inference_desc(LegionTritonRuntime::RunModelInferenceTask); - CodeDescriptor load_layer_desc(LegionTritonRuntime::LoadLayerTask); - CodeDescriptor free_layer_desc(LegionTritonRuntime::FreeLayerTask); - for (auto proc : local_cpus) { - Realm::Event registered = proc.register_task( - INSTANCE_CREATE_TASK_ID, message_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if (registered.exists()) - ready_events.push_back(registered); - registered = proc.register_task( - CONTEXT_CREATE_TASK_ID, context_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if (registered.exists()) - ready_events.push_back(registered); - registered = proc.register_task( - RUN_MODEL_INFERENCE_TASK_ID, inference_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if 
(registered.exists()) - ready_events.push_back(registered); - registered = proc.register_task( - LOAD_LAYER_TASK_ID, load_layer_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if (registered.exists()) - ready_events.push_back(registered); - registered = proc.register_task( - FREE_LAYER_TASK_ID, free_layer_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if (registered.exists()) - ready_events.push_back(registered); - } - // also need to register layer tasks on GPUs as well - CodeDescriptor init_cuda_desc(LegionTritonRuntime::InitCudaTask); - unsigned gpu_index = 0; - for (auto proc : local_gpus) { - Realm::Event registered = proc.register_task( - INIT_CUDALIBS_TASK_ID, init_cuda_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - // Also launch the task to initialize the cuda libraries - registered = proc.spawn( - INIT_CUDALIBS_TASK_ID, &gpu_index, sizeof(gpu_index), registered); - gpu_index++; - if (!registered.exists()) - ready_events.push_back(registered); - registered = proc.register_task( - LOAD_LAYER_TASK_ID, load_layer_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if (registered.exists()) - ready_events.push_back(registered); - registered = proc.register_task( - FREE_LAYER_TASK_ID, free_layer_desc, no_requests, runtime, - sizeof(LegionTritonRuntime*)); - if (registered.exists()) - ready_events.push_back(registered); - } - // Do a Realm barrier here to make sure everyone is done registering - Realm::Runtime realm = Realm::Runtime::get_runtime(); - if (!ready_events.empty()) - realm - .collective_spawn_by_kind( - Realm::Processor::LOC_PROC /*CPU*/, - Realm::Processor::TASK_ID_PROCESSOR_NOP, nullptr, 0, - true /*one per node*/, Realm::Event::merge_events(ready_events)) - .external_wait(); - else - realm - .collective_spawn_by_kind( - Realm::Processor::LOC_PROC /*CPU*/, - Realm::Processor::TASK_ID_PROCESSOR_NOP, nullptr, 0, - true /*one per node*/) - .external_wait(); - - return nullptr; -} - -LegionTritonRuntime::LegionTritonRuntime( - Legion::Runtime* lg, Legion::TaskID ttid, Legion::AddressSpaceID rank, - size_t total, Processor local, std::vector&& remote, - std::vector&& cpus, std::vector&& gpus, - const std::vector& local_gpus, bool allowTensorOpMathConv) - : legion_(lg), top_task_id_(ttid), rank_(rank), total_ranks_(total), - local_proc_(local), local_sysmem_(FindLocalSysmem()), - local_regmem_(FindLocalRegmem()), remote_procs_(std::move(remote)), - all_cpus_(std::move(cpus)), all_gpus_(std::move(gpus)), - local_cpus_(FindLocal(rank_, all_cpus_)), - local_gpus_(FindLocal(rank_, all_gpus_)), - local_framebuffers_(FindLocalFramebuffers(local_gpus)), - allowTensorOpMathConversion_(allowTensorOpMathConv) -{ - creation_barrier_ = Realm::Barrier::NO_BARRIER; - for (unsigned idx = 0; idx < local_gpus.size(); idx++) - gpu_lookup_[local_gpus[idx]] = idx; -} - -/*static*/ std::vector -LegionTritonRuntime::FindLocal( - AddressSpaceID local, const std::vector& procs) -{ - std::vector result; - for (auto proc : procs) - if (proc.address_space() == local) - result.push_back(proc); - return result; -} - -/*static*/ Realm::Memory -LegionTritonRuntime::FindLocalSysmem(void) -{ - Machine machine = Machine::get_machine(); - Machine::MemoryQuery local_sysmem(machine); - local_sysmem.local_address_space(); - local_sysmem.only_kind(Memory::SYSTEM_MEM); - assert(local_sysmem.count() == 1); - return local_sysmem.first(); -} - -/*static*/ Realm::Memory -LegionTritonRuntime::FindLocalRegmem(void) -{ - Machine machine = Machine::get_machine(); - 
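
FindLocalSysmem and FindLocalRegmem above, and the FindLocalFramebuffers query below, all repeat one Realm machine-model idiom: construct a MemoryQuery, narrow it by address space and kind, then take the first match. A generic helper capturing the shared pattern might look like the following sketch (hypothetical, not part of this backend; it assumes Realm's Machine/MemoryQuery API exactly as used in this file):

// Hypothetical helper; the umbrella include is "realm.h" in the Legion tree,
// though the path may differ by build.
#include "realm.h"

static Realm::Memory
find_local_memory(Realm::Memory::Kind kind)
{
  Realm::Machine machine = Realm::Machine::get_machine();
  Realm::Machine::MemoryQuery query(machine);
  query.local_address_space();  // only memories visible to this rank
  query.only_kind(kind);
  // Some kinds (e.g. registered memory) may legitimately be absent, so fall
  // back to NO_MEMORY rather than asserting as the sysmem lookup does.
  return (query.count() == 0) ? Realm::Memory::NO_MEMORY : query.first();
}

FindLocalSysmem and FindLocalRegmem would then reduce to calls like find_local_memory(Realm::Memory::SYSTEM_MEM) and find_local_memory(Realm::Memory::REGDMA_MEM).
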
Machine::MemoryQuery local_regmem(machine); - local_regmem.local_address_space(); - local_regmem.only_kind(Memory::REGDMA_MEM); - assert(local_regmem.count() <= 1); - if (local_regmem.count() == 0) - return Memory::NO_MEMORY; - return local_regmem.first(); -} - -/*static*/ std::vector -LegionTritonRuntime::FindLocalFramebuffers( - const std::vector& gpus) -{ - Machine machine = Machine::get_machine(); - std::vector framebuffers(gpus.size()); - for (unsigned idx = 0; idx < gpus.size(); idx++) { - // Now we can find our framebuffer - Machine::MemoryQuery local_fb(machine); - local_fb.only_kind(Memory::GPU_FB_MEM); - local_fb.best_affinity_to(gpus[idx]); - assert(local_fb.count() >= 1); - framebuffers[idx] = local_fb.first(); - } - return framebuffers; -} - -void -LegionTritonRuntime::RecordModel(LegionModelState* model) -{ - AutoLock lock(lock_); - models_.push_back(model); -} - -void -LegionTritonRuntime::RemoveModel(LegionModelState* model) -{ - AutoLock lock(lock_); - for (auto it = models_.begin(); it != models_.end(); it++) { - if ((*it) != model) - continue; - models_.erase(it); - break; - } -} - -void -LegionTritonRuntime::RendezvousContextCreation( - LegionModelInstance* instance, Realm::UserEvent ready) -{ - // If we're not rank 0 send the message there - if (rank_ > 0) { - Serializer rez; - const size_t length = instance->model_state_->name.size(); - rez.serialize(length); - rez.serialize(instance->model_state_->name.c_str(), length); - rez.serialize(instance->model_state_->version); - rez.serialize(instance->index_); - rez.serialize(local_proc_); - rez.serialize(instance); - rez.serialize(ready); - remote_procs_[0].spawn( - INSTANCE_CREATE_TASK_ID, rez.get_buffer(), rez.get_used_bytes(), - Realm::Event::NO_EVENT, INT_MAX /*high priority*/); - } else - HandleContextCreation( - instance->model_state_->name, instance->model_state_->version, - instance->index_, local_proc_, instance, ready); -} - -unsigned -LegionTritonRuntime::FindGPUIndex(Processor proc) const -{ - std::map::const_iterator finder = gpu_lookup_.find(proc); - assert(finder != gpu_lookup_.end()); - return finder->second; -} - -const std::vector& -LegionTritonRuntime::FindAllProcessors(Processor::Kind kind) -{ - switch (kind) { - case Processor::LOC_PROC: - return all_cpus_; - case Processor::TOC_PROC: - return all_gpus_; - default: - abort(); - } - return all_cpus_; -} - -const std::vector& -LegionTritonRuntime::FindLocalProcessors(Processor::Kind kind) -{ - switch (kind) { - case Processor::LOC_PROC: - return local_cpus_; - case Processor::TOC_PROC: - return local_gpus_; - default: - abort(); - } - return local_cpus_; -} - -Memory -LegionTritonRuntime::FindMemory( - TRITONSERVER_MemoryType type, uint64_t device_id) -{ - switch (type) { - case TRITONSERVER_MEMORY_CPU: { - assert(local_sysmem_.exists()); - return local_sysmem_; - } - case TRITONSERVER_MEMORY_CPU_PINNED: { - assert(local_regmem_.exists()); - return local_regmem_; - } - case TRITONSERVER_MEMORY_GPU: { - // Hopefully Triton counts device IDs the same way as Realm does - assert(device_id < local_framebuffers_.size()); - return local_framebuffers_[device_id]; - } - default: - assert(false); - } - return Memory::NO_MEMORY; -} - -Realm::Event -LegionTritonRuntime::LoadLayer(Processor proc, Operator* op) -{ - return proc.spawn( - LOAD_LAYER_TASK_ID, &op, sizeof(op), Realm::Event::NO_EVENT, - INT_MAX /*high priority*/); -} - -Realm::Event -LegionTritonRuntime::FreeLayer(Processor proc, Operator* op) -{ - return proc.spawn( - FREE_LAYER_TASK_ID, &op, sizeof(op), 
Realm::Event::NO_EVENT, - INT_MAX /*high priority*/); -} - -void -LegionTritonRuntime::HandleContextCreation( - const std::string& name, uint64_t version, unsigned index, - Realm::Processor source, LegionModelInstance* instance, - Realm::UserEvent ready, bool external, bool need_lock) -{ - // Should only be here on rank 0 - assert(rank_ == 0); - if (need_lock) { - if (external) { - AutoLock<true> lock(lock_); - HandleContextCreation( - name, version, index, source, instance, ready, true, false); - } else { - AutoLock<false> lock(lock_); - HandleContextCreation( - name, version, index, source, instance, ready, false, false); - } - return; - } - // Now we've got the lock - for (auto it = pending_contexts_.begin(); it != pending_contexts_.end(); - it++) { - if (!it->matches(name, version, index)) - continue; - assert(it->requests.find(source) == it->requests.end()); - it->requests[source] = std::make_pair(instance, ready); - // If we've seen all the requests we know we're ready for the rendezvous - if (it->requests.size() == total_ranks_) { - CreatePendingContext(*it); - pending_contexts_.erase(it); - } - return; - } - if (total_ranks_ > 1) { - pending_contexts_.emplace_back(PendingContext(name, version, index)); - pending_contexts_.back().requests[source] = std::make_pair(instance, ready); - } else { - // Special case of single process execution - PendingContext pending(name, version, index); - pending.requests[source] = std::make_pair(instance, ready); - CreatePendingContext(pending); - } -} - -void -LegionTritonRuntime::CreatePendingContext(const PendingContext& pending) -{ - assert(pending.requests.size() == total_ranks_); - // Get a barrier for coordinating ordering of the creations - Realm::Barrier bar; - // Also get the precondition from the previous creation - Realm::Event precondition; - if (creation_barrier_.exists()) { - bar = creation_barrier_; - precondition = creation_barrier_.get_previous_phase(); - } else { - creation_barrier_ = Realm::Barrier::create_barrier(total_ranks_); - bar = creation_barrier_; - precondition = Realm::Event::NO_EVENT; - } - creation_barrier_ = creation_barrier_.advance_barrier(); - // TODO: we assert if we run out of barrier generations here - // If we make more than 2^12 contexts at any point this could be an issue - // Subject to changes in Realm's setting for default barrier generations - assert(creation_barrier_.exists()); - // Launch tasks to make the contexts on all the nodes - // TODO: build a broadcast tree to do this for more scalability - // For node counts less than a few hundred we should be alright - for (auto it = pending.requests.begin(); it != pending.requests.end(); it++) { - Serializer rez; - rez.serialize(it->second.first); - rez.serialize(bar); - it->first.spawn( - CONTEXT_CREATE_TASK_ID, rez.get_buffer(), rez.get_used_bytes(), - precondition, INT_MAX /*high priority*/); - // We know the context will be ready when the barrier completes - it->second.second.trigger(bar); - } -} - -/*static*/ void -LegionTritonRuntime::InstanceMessageTask( - const void* args, size_t arglen, const void* data, size_t datalen, - Realm::Processor p) -{ - assert(datalen == sizeof(LegionTritonRuntime*)); - LegionTritonRuntime* runtime = *((LegionTritonRuntime**)data); - Deserializer derez(args, arglen); - size_t length; - derez.deserialize(length); - std::string name((const char*)derez.get_current_pointer(), length); - derez.advance_pointer(length); - uint64_t version; - derez.deserialize(version); - unsigned index; - derez.deserialize(index); - Processor source; -
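// [Editor's note] CreatePendingContext above chains a single Realm barrier
// across all context creations: each creation arrives on the current phase and
// takes the previous phase as its precondition, so creations are globally
// ordered without any extra messages. A minimal sketch of the same idiom,
// assuming hypothetical launch_ordered_work(), num_ranks, and num_steps
// (placeholders, not from this file):
#if 0  // illustrative sketch only, not part of the original source
Realm::Barrier chain = Realm::Barrier::NO_BARRIER;
for (int step = 0; step < num_steps; step++) {
  Realm::Event precondition;
  if (chain.exists()) {
    precondition = chain.get_previous_phase();
  } else {
    chain = Realm::Barrier::create_barrier(num_ranks);
    precondition = Realm::Event::NO_EVENT;
  }
  launch_ordered_work(chain, precondition);  // arrives on `chain` when done
  chain = chain.advance_barrier();           // next phase for the next creation
  assert(chain.exists());  // Realm barriers support ~2^12 generations
}
#endif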
derez.deserialize(source); - LegionModelInstance* instance; - derez.deserialize(instance); - Realm::UserEvent ready; - derez.deserialize(ready); - assert(!derez.get_remaining_bytes()); - - runtime->HandleContextCreation( - name, version, index, source, instance, ready, false /*external*/); -} - -/*static*/ void -LegionTritonRuntime::CreateContextTask( - const void* args, size_t arglen, const void* data, size_t datalen, - Realm::Processor p) -{ - assert(datalen == sizeof(LegionTritonRuntime*)); - LegionTritonRuntime* runtime = *((LegionTritonRuntime**)data); - Deserializer derez(args, arglen); - LegionModelInstance* instance; - derez.deserialize(instance); - Realm::Barrier barrier; - derez.deserialize(barrier); - assert(!derez.get_remaining_bytes()); - - // Create the context - instance->CreateContext( - runtime->legion_, runtime->top_task_id_, runtime->rank_, - runtime->total_ranks_, barrier, - (runtime->InstanceOwner(instance->index_) == runtime->rank_)); - // Arrive on the barrier signaling that we are done - barrier.arrive(); -} - -AddressSpaceID -LegionTritonRuntime::InstanceOwner(unsigned instance_index) const -{ - // Round robin instances across the nodes so we get some load balance - // of responsibilities for scheduling work on different instances - // TODO: a better hashing scheme here for load balance that incorporates - // the model names and versions as well - return (instance_index % total_ranks_); -} - -LegionModelInstance* -LegionTritonRuntime::FindModelInstance( - const std::string& model_name, uint64_t model_version, - unsigned instance_index, bool external, bool need_lock) -{ - if (need_lock) { - if (external) { - AutoLock<true> lock(lock_, false /*exclusive*/); - return FindModelInstance( - model_name, model_version, instance_index, true, false); - } else { - AutoLock<false> lock(lock_, false /*exclusive*/); - return FindModelInstance( - model_name, model_version, instance_index, false, false); - } - } - for (auto model : models_) { - if (model_name != model->name) - continue; - if (model_version != model->version) - continue; - return model->FindInstance(instance_index, external); - } - // should never get here - assert(false); - return NULL; -} - -void -LegionTritonRuntime::DistributeRunModel( - const std::string& model_name, uint64_t model_version, - unsigned instance_index, const std::vector<InputTensor>& inputs, - const std::vector<OutputTensor>& outputs, - std::vector<uint64_t>& compute_input_end_ns, - std::vector<uint64_t>& compute_output_start_ns, AddressSpaceID source, - LegionModelInstance* instance, Realm::Barrier barrier, - Realm::UserEvent pretrigger, Realm::UserEvent posttrigger) -{ - Realm::Event precondition = Realm::Event::NO_EVENT; - const AddressSpaceID owner_rank = InstanceOwner(instance_index); - if (owner_rank != rank_) { - assert(!pretrigger.exists()); - assert(!posttrigger.exists()); - // Check to see if this is already our broadcast message - if (source == rank_) { - assert(!barrier.exists()); - // Send the message to the owner rank to handle this - pretrigger = Realm::UserEvent::create_user_event(); - posttrigger = Realm::UserEvent::create_user_event(); - Serializer rez; - PackRunModel( - rez, model_name, model_version, instance_index, inputs, outputs); - rez.serialize(Realm::Barrier::NO_BARRIER); - rez.serialize(pretrigger); - rez.serialize(posttrigger); - rez.serialize(rank_); - remote_procs_[owner_rank].spawn( - RUN_MODEL_INFERENCE_TASK_ID, rez.get_buffer(), rez.get_used_bytes(), - Realm::Event::NO_EVENT, INT_MAX); - precondition = pretrigger; - } else { - // This came from the broadcast node so we can
just run it - assert(barrier.exists()); - assert(!pretrigger.exists()); - assert(!posttrigger.exists()); - // No precondition as it was added to the launch - } - } else { - assert(!barrier.exists()); - if (instance == NULL) - instance = FindModelInstance( - model_name, model_version, instance_index, (source == rank_)); - barrier = instance->GetExecutionBarrier( - total_ranks_, precondition, (source == rank_)); - Serializer rez; - PackRunModel( - rez, model_name, model_version, instance_index, inputs, outputs); - rez.serialize(barrier); - rez.serialize(Realm::UserEvent::NO_USER_EVENT); - rez.serialize(Realm::UserEvent::NO_USER_EVENT); - rez.serialize(rank_); - // Broadcast out the request to all the other ranks - for (AddressSpaceID rank = 0; rank < total_ranks_; rank++) { - if (rank == rank_) - continue; - if (rank == source) { - // No need to send a message back to the source - assert(pretrigger.exists()); - assert(posttrigger.exists()); - pretrigger.trigger(precondition); - barrier.arrive(1 /*count*/, posttrigger); - } else - remote_procs_[rank].spawn( - RUN_MODEL_INFERENCE_TASK_ID, rez.get_buffer(), rez.get_used_bytes(), - precondition, INT_MAX); - } - } - // Find the instance we are running - if (instance == NULL) - instance = FindModelInstance( - model_name, model_version, instance_index, (source == rank_)); - // If we have a precondition, wait for that before we start the call - if (precondition.exists()) { - if (source == rank_) - precondition.external_wait(); - else - precondition.wait(); - } - // Run the model - instance->RunModel( - inputs, outputs, compute_input_end_ns, compute_output_start_ns, - true /*distributed*/); - if (!barrier.exists()) { - assert(posttrigger.exists()); - posttrigger.trigger(); - } else - barrier.arrive(); -} - -/*static*/ void -LegionTritonRuntime::PackRunModel( - Serializer& rez, const std::string& model_name, uint64_t model_version, - unsigned instance_index, const std::vector& inputs, - const std::vector& outputs) -{ - const size_t length = model_name.size(); - rez.serialize(length); - rez.serialize(model_name.c_str(), length); - rez.serialize(model_version); - rez.serialize(instance_index); - rez.serialize(inputs.size()); - for (auto& tensor : inputs) { - const size_t namelen = tensor.name_.size(); - rez.serialize(namelen); - rez.serialize(tensor.name_.c_str(), namelen); - rez.serialize(tensor.buffers_.size()); - for (auto ptr : tensor.buffers_) rez.serialize(ptr); - assert(tensor.buffers_.size() == tensor.buffer_locations_.size()); - for (auto& loc : tensor.buffer_locations_) { - rez.serialize(loc.first); - rez.serialize(loc.second); - } - assert(tensor.buffers_.size() == tensor.buffer_memories_.size()); - for (auto& mem : tensor.buffer_memories_) rez.serialize(mem); - rez.serialize(tensor.strides_.size()); - for (auto stride : tensor.strides_) rez.serialize(stride); - } - rez.serialize(outputs.size()); - for (auto& tensor : outputs) { - const size_t namelen = tensor.name_.size(); - rez.serialize(namelen); - rez.serialize(tensor.name_.c_str(), namelen); - rez.serialize(tensor.buffers_.size()); - for (auto ptr : tensor.buffers_) rez.serialize(ptr); - assert(tensor.buffers_.size() == tensor.buffer_locations_.size()); - for (auto& loc : tensor.buffer_locations_) { - rez.serialize(loc.first); - rez.serialize(loc.second); - } - assert(tensor.buffers_.size() == tensor.buffer_memories_.size()); - for (auto& mem : tensor.buffer_memories_) rez.serialize(mem); - rez.serialize(tensor.strides_.size()); - for (auto stride : tensor.strides_) 
rez.serialize(stride); - } -} - -/*static*/ void -LegionTritonRuntime::RunModelInferenceTask( - const void* args, size_t arglen, const void* data, size_t datalen, - Realm::Processor p) -{ - assert(datalen == sizeof(LegionTritonRuntime*)); - LegionTritonRuntime* runtime = *((LegionTritonRuntime**)data); - Deserializer derez(args, arglen); - size_t length; - derez.deserialize(length); - std::string model_name((const char*)derez.get_current_pointer(), length); - derez.advance_pointer(length); - uint64_t model_version; - derez.deserialize(model_version); - unsigned instance_index; - derez.deserialize(instance_index); - size_t num_inputs; - derez.deserialize(num_inputs); - std::vector inputs(num_inputs); - for (unsigned idx1 = 0; idx1 < num_inputs; idx1++) { - InputTensor& tensor = inputs[idx1]; - size_t namelen; - derez.deserialize(namelen); - tensor.name_ = - std::string((const char*)derez.get_current_pointer(), namelen); - derez.advance_pointer(namelen); - size_t num_buffers; - derez.deserialize(num_buffers); - tensor.buffers_.resize(num_buffers); - for (unsigned idx2 = 0; idx2 < num_buffers; idx2++) - derez.deserialize(tensor.buffers_[idx2]); - tensor.buffer_locations_.resize(num_buffers); - for (unsigned idx2 = 0; idx2 < num_buffers; idx2++) { - auto& pair = tensor.buffer_locations_[idx2]; - derez.deserialize(pair.first); - derez.deserialize(pair.second); - } - tensor.buffer_memories_.resize(num_buffers); - for (unsigned idx2 = 0; idx2 < num_buffers; idx2++) - derez.deserialize(tensor.buffer_memories_[idx2]); - size_t num_strides; - derez.deserialize(num_strides); - tensor.strides_.resize(num_strides); - for (unsigned idx2 = 0; idx2 < num_strides; idx2++) - derez.deserialize(tensor.strides_[idx2]); - } - size_t num_outputs; - derez.deserialize(num_outputs); - std::vector outputs(num_outputs); - for (unsigned idx1 = 0; idx1 < num_outputs; idx1++) { - OutputTensor& tensor = outputs[idx1]; - size_t namelen; - derez.deserialize(namelen); - tensor.name_ = - std::string((const char*)derez.get_current_pointer(), namelen); - derez.advance_pointer(namelen); - size_t num_buffers; - derez.deserialize(num_buffers); - tensor.buffers_.resize(num_buffers); - for (unsigned idx2 = 0; idx2 < num_buffers; idx2++) - derez.deserialize(tensor.buffers_[idx2]); - tensor.buffer_locations_.resize(num_buffers); - for (unsigned idx2 = 0; idx2 < num_buffers; idx2++) { - auto& pair = tensor.buffer_locations_[idx2]; - derez.deserialize(pair.first); - derez.deserialize(pair.second); - } - tensor.buffer_memories_.resize(num_buffers); - for (unsigned idx2 = 0; idx2 < num_buffers; idx2++) - derez.deserialize(tensor.buffer_memories_[idx2]); - size_t num_strides; - derez.deserialize(num_strides); - tensor.strides_.resize(num_strides); - for (unsigned idx2 = 0; idx2 < num_strides; idx2++) - derez.deserialize(tensor.strides_[idx2]); - } - Realm::Barrier barrier; - derez.deserialize(barrier); - Realm::UserEvent pretrigger, posttrigger; - derez.deserialize(pretrigger); - derez.deserialize(posttrigger); - AddressSpaceID source; - derez.deserialize(source); - assert(!derez.get_remaining_bytes()); - - std::vector dummy_input_timing, dummy_output_timing; - runtime->DistributeRunModel( - model_name, model_version, instance_index, inputs, outputs, - dummy_input_timing, dummy_output_timing, source, - NULL /*unknown instance*/, barrier, pretrigger, posttrigger); -} - -/*static*/ void -LegionTritonRuntime::InitCudaTask( - const void* args, size_t arglen, const void* data, size_t datalen, - Realm::Processor p) -{ -#ifdef LEGION_USE_CUDA - 
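// [Editor's note] Context for the block below: INIT_CUDALIBS_TASK_ID tasks are
// presumably spawned once per local GPU processor, and because Realm executes
// the task body on that processor, the cuDNN and cuBLAS handles created here
// end up bound to that GPU's CUDA context; `index` selects the per-processor
// slot in the cudnn/cublas arrays declared in runtime.h (sized MAX_LOCAL_PROCS).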
assert(arglen == sizeof(unsigned)); - const unsigned index = *((const unsigned*)args); - assert(datalen == sizeof(LegionTritonRuntime*)); - LegionTritonRuntime* runtime = *((LegionTritonRuntime**)data); - CHECK_CUDNN(cudnnCreate(&(runtime->cudnn[index]))); - CHECK_CUBLAS(cublasCreate(&(runtime->cublas[index]))); -#else - abort(); -#endif -} - -/*static*/ void -LegionTritonRuntime::LoadLayerTask( - const void* args, size_t arglen, const void* data, size_t datalen, - Realm::Processor p) -{ - assert(arglen == sizeof(Operator**)); - assert(datalen == sizeof(LegionTritonRuntime*)); - Operator* op = *((Operator**)args); - op->Load(p); -} - -/*static*/ void -LegionTritonRuntime::FreeLayerTask( - const void* args, size_t arglen, const void* data, size_t datalen, - Realm::Processor p) -{ - assert(arglen == sizeof(Operator**)); - assert(datalen == sizeof(LegionTritonRuntime*)); - Operator* op = *((Operator**)args); - op->Free(p); -} - -}}} // namespace triton::backend::legion diff --git a/triton/src/runtime.h b/triton/src/runtime.h deleted file mode 100644 index b32bced3c2..0000000000 --- a/triton/src/runtime.h +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_RUNTIME_H__ -#define __LEGION_TRITON_RUNTIME_H__ - -#include "config.h" -#include "legion.h" -#include "triton/backend/backend_common.h" -#include "types.h" -#ifdef LEGION_USE_CUDA -#include "cudahelp.h" -#endif - -namespace triton { namespace backend { namespace legion { - -// -// Legion Triton Runtime -// This is a small runtime built to facilitate the glue operations -// between requests coming in from Triton and running inference jobs -// in Legion. It provides facilities for hooking up model instances -// across the different processes so that they are all aligned even -// if models are loaded out of order. It also helps order requests -// to the different model instances coming from different nodes. -// Despite its name, it's actually a Realm runtime since it works -// in parallel with Legion on top of Realm. -// -class LegionTritonRuntime { - public: - enum { - // TODO: This is a bit of a hack in that we are "guessing" about - // where Legion is going to register task variants with Realm - // utility processors. In practice, Legion would need to register - // 1M task variants with utility processors before it conflicted - // with these task IDs. Legion registers at most a few task variants - // with utility processors so conflicts should never happen in practice. - // It would be nice to do this in a way that doesn't rely on having - // knowledge of Legion's internals though.
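// (For scale: 1 << 20 is 1,048,576, so Legion would have to register roughly
// a million utility-processor task variants before colliding with these IDs.)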
- INSTANCE_CREATE_TASK_ID = 1 << 20, - CONTEXT_CREATE_TASK_ID, - RUN_MODEL_INFERENCE_TASK_ID, - INIT_CUDALIBS_TASK_ID, - LOAD_LAYER_TASK_ID, - FREE_LAYER_TASK_ID, - }; - struct PendingContext { - public: - PendingContext(const std::string& name, uint64_t v, unsigned idx) - : model_name(name), version(v), index(idx) - { - } - inline bool matches(const std::string& name, uint64_t v, unsigned idx) const - { - if (name != model_name) - return false; - if (version != v) - return false; - if (index != idx) - return false; - return true; - } - - public: - std::string model_name; - uint64_t version; - unsigned index; - std::map< - Realm::Processor, std::pair<LegionModelInstance*, Realm::UserEvent> > - requests; - }; - - public: - static TRITONSERVER_Error* Create( - Legion::TaskID ttid, LegionTritonRuntime** runtime); - - private: - LegionTritonRuntime( - Legion::Runtime* lg, Legion::TaskID ttid, Legion::AddressSpaceID rank, - size_t total, Realm::Processor local, - std::vector<Realm::Processor>&& remote, - std::vector<Realm::Processor>&& cpus, - std::vector<Realm::Processor>&& gpus, - const std::vector<Realm::Processor>& local_gpus, - bool allowTensorOpMathConversion = true); - - public: - void RecordModel(LegionModelState* model); - void RemoveModel(LegionModelState* model); - void RendezvousContextCreation( - LegionModelInstance* instance, Realm::UserEvent ready); - void DistributeRunModel( - const std::string& model_name, uint64_t model_version, - unsigned instance_index, const std::vector<InputTensor>& inputs, - const std::vector<OutputTensor>& outputs, - std::vector<uint64_t>& compute_input_end_ns, - std::vector<uint64_t>& compute_output_start_ns, - Legion::AddressSpaceID source, LegionModelInstance* instance = NULL, - Realm::Barrier barrier = Realm::Barrier::NO_BARRIER, - Realm::UserEvent pretrigger = Realm::UserEvent::NO_USER_EVENT, - Realm::UserEvent posttrigger = Realm::UserEvent::NO_USER_EVENT); - - public: - unsigned FindGPUIndex(Realm::Processor proc) const; - const std::vector<Realm::Processor>& FindAllProcessors( - Realm::Processor::Kind kind); - const std::vector<Realm::Processor>& FindLocalProcessors( - Realm::Processor::Kind kind); - Realm::Memory FindMemory(TRITONSERVER_MemoryType type, uint64_t device_id); - Realm::Event LoadLayer(Realm::Processor proc, Operator* op); - Realm::Event FreeLayer(Realm::Processor proc, Operator* op); - - protected: - void HandleContextCreation( - const std::string& name, uint64_t version, unsigned index, - Realm::Processor source, LegionModelInstance* instance, - Realm::UserEvent ready, bool external = true, bool need_lock = true); - void CreatePendingContext(const PendingContext& pending); - Legion::AddressSpaceID InstanceOwner(unsigned instance_index) const; - LegionModelInstance* FindModelInstance( - const std::string& model_name, uint64_t model_version, - unsigned instance_index, bool external, bool need_lock = true); - - public: - static void InstanceMessageTask( - const void* args, size_t arglen, const void* userdata, size_t userlen, - Realm::Processor p); - static void CreateContextTask( - const void* args, size_t arglen, const void* userdata, size_t userlen, - Realm::Processor p); - static void RunModelInferenceTask( - const void* args, size_t arglen, const void* userdata, size_t userlen, - Realm::Processor p); - static void InitCudaTask( - const void* args, size_t arglen, const void* userdata, size_t userlen, - Realm::Processor p); - static void LoadLayerTask( - const void* args, size_t arglen, const void* userdata, size_t userlen, - Realm::Processor p); - static void FreeLayerTask( - const void* args, size_t arglen, const void* userdata, size_t userlen, - Realm::Processor p); - - protected: - static std::vector<Realm::Processor> FindLocal( - Legion::AddressSpaceID local, const std::vector<Realm::Processor>& procs); - static Realm::Memory FindLocalSysmem(void); - static Realm::Memory FindLocalRegmem(void); - static std::vector<Realm::Memory> FindLocalFramebuffers( - const std::vector<Realm::Processor>& gpus); - static void PackRunModel( - Legion::Serializer& rez, const std::string& model_name, - uint64_t model_version, unsigned instance_index, - const std::vector<InputTensor>& inputs, - const std::vector<OutputTensor>& outputs); - - public: - Legion::Runtime* const legion_; - const Legion::TaskID top_task_id_; - const Legion::AddressSpaceID rank_; - const size_t total_ranks_; - const Realm::Processor local_proc_; - const Realm::Memory local_sysmem_; - const Realm::Memory local_regmem_; - const std::vector<Realm::Processor> remote_procs_; - const std::vector<Realm::Processor> all_cpus_; - const std::vector<Realm::Processor> all_gpus_; - const std::vector<Realm::Processor> local_cpus_; - const std::vector<Realm::Processor> local_gpus_; - const std::vector<Realm::Memory> local_framebuffers_; - const bool allowTensorOpMathConversion_; -#ifdef LEGION_USE_CUDA - public: - cudnnHandle_t cudnn[MAX_LOCAL_PROCS]; - cublasHandle_t cublas[MAX_LOCAL_PROCS]; -#endif - private: - Realm::FastReservation lock_; - std::vector<LegionModelState*> models_; - - private: - std::list<PendingContext> pending_contexts_; - Realm::Barrier creation_barrier_; - std::map<Realm::Processor, unsigned> gpu_lookup_; -}; - -// A small helper class for using Realm's fast reservation - // synchronization primitive in external threads -template <bool EXTERNAL> -class AutoLock { - public: - AutoLock(Realm::FastReservation& lock, bool exclusive = true) : lock_(lock) - { - const Realm::Event wait_on = exclusive ? lock_.wrlock() : lock_.rdlock(); - if (wait_on.exists() && !wait_on.has_triggered()) { - if (EXTERNAL) - wait_on.external_wait(); - else - wait_on.wait(); - } - } - ~AutoLock(void) { lock_.unlock(); } - - private: - Realm::FastReservation& lock_; -}; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_RUNTIME_H__ diff --git a/triton/src/strategy.cc b/triton/src/strategy.cc deleted file mode 100644 index 28c0fddcd7..0000000000 --- a/triton/src/strategy.cc +++ /dev/null @@ -1,1884 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "strategy.h" -#include -#include -#include -#include "model.h" -#include "runtime.h" - -using namespace Legion; -using namespace Legion::Mapping; - -namespace triton { namespace backend { namespace legion { - -Logger log_triton("triton"); - -ShardingFunction::ShardingFunction(ShardingID sid, const LayerStrategy* s) - : sharding_id(sid), strategy(s) -{ -} - -Processor -ShardingFunction::find_proc(const DomainPoint& point, const Domain& full_space) -{ - size_t offset = 0; - const int dims = point.get_dim(); - // We transposed the dimensions when loading the partitioning - // strategy file, so we need to transpose what order we walk - // the dimensions when looking for the processor - assert(dims > 0); - for (int d = dims - 1; d >= 0; d--) { - size_t pitch = full_space.hi()[d] - full_space.lo()[d] + 1; - offset = offset * pitch + point[d] - full_space.lo()[d]; - } - assert(offset < strategy->global_processors.size()); - return strategy->global_processors[offset]; -} - -ShardID -ShardingFunction::shard( - const DomainPoint& point, const Domain& full_space, - const size_t total_shards) -{ - const Processor proc = find_proc(point, full_space); - return proc.address_space(); -} - -LayerStrategy::LayerStrategy(ShardingID sid, MappingTagID t, Runtime* runtime) - : sharding_function(new ShardingFunction(sid, this)), tag(t) -{ - // Register this sharding functor with legion - runtime->register_sharding_functor( - sid, sharding_function, true /*silence warnings*/); -} - -LayerStrategy::~LayerStrategy(void) -{ - // TODO: tell legion to unregister the sharding function -} - -Domain -LayerStrategy::get_launch_domain(void) const -{ - DomainPoint lo, hi; - lo.dim = nDims; - hi.dim = nDims; - for (int i = 0; i < nDims; i++) { - lo[i] = 0; - // Legion domains are inclusive - assert(dim[i] > 0); - hi[i] = dim[i] - 1; - } - return Domain(lo, hi); -} - -Domain -LayerStrategy::find_local_domain( - Processor proc, const Legion::Domain& global) const -{ - const DomainPoint local_point = find_local_point(proc); - const int dims = local_point.get_dim(); - assert(dims == global.get_dim()); - DomainPoint lo, hi; - lo.dim = dims; - hi.dim = dims; - for (int d = 0; d < dims; d++) { - // this will round up so we tile the entire space - const coord_t tile = (global.hi()[d] - global.lo()[d] + dim[d]) / dim[d]; - lo[d] = local_point[d] * tile; - hi[d] = (local_point[d] + 1) * tile - 1; - // clamp to the upper bound space - if (hi[d] > global.hi()[d]) - hi[d] = global.hi()[d]; - } - return Domain(lo, hi); -} - -bool -LayerStrategy::is_local_processor(Processor proc) const -{ - for (unsigned idx = 0; idx < nProcs; idx++) - if (local_processors[idx] == proc) - return true; - return false; -} - -unsigned -LayerStrategy::find_local_offset(Processor proc) const -{ - for (unsigned idx = 0; idx < nProcs; idx++) - if (local_processors[idx] == proc) - return idx; - abort(); - return 0; -} - -DomainPoint -LayerStrategy::find_local_point(Realm::Processor proc) const -{ - for (unsigned idx = 0; idx < nProcs; idx++) - if (local_processors[idx] == proc) - return local_points[idx]; - abort(); - return DomainPoint(); -} - -/*static*/ PartitionStrategy* -PartitionStrategy::LoadStrategy( - const std::string& filename, LegionModelState* model) -{ - std::fstream input(filename, std::ios::in); - if (!input) { - std::cerr << "Failed to open strategy file for reading" << std::endl; - abort(); - } - - int ops_size = 0; - input >> ops_size; - - LegionTritonRuntime* runtime = model->runtime_; - // Allocate sharding function IDs for 
this model - // We generate a unique string name for this model by concatenating - // its name with its version number - const std::string unique_name = model->name + std::to_string(model->version); - ShardingID first_id = runtime->legion_->generate_library_sharding_ids( - unique_name.c_str(), ops_size); - std::vector<LayerStrategy*> layers(ops_size); - for (int i = 0; i < ops_size; i++) { - LayerStrategy* layer = new LayerStrategy(first_id + i, i, runtime->legion_); - char op_name[64]; // hard-coded size from flexflow - input >> op_name; - int device_type; - input >> device_type; - switch (device_type) { - // These are hard-coded from FlexFlow source code - case 0: - layer->kind = Processor::TOC_PROC; - break; - case 1: - layer->kind = Processor::LOC_PROC; - break; - default: - fprintf(stderr, "Unsupported Device Type %d\n", device_type); - abort(); - } - input >> layer->nDims; - assert(layer->nDims > 0); - int n = 1; - // Note: we transpose the dimensions here from how FlexFlow represents - // them because we keep our dimensions aligned with ONNX, e.g. NCHW - for (int j = layer->nDims - 1; j >= 0; j--) { - input >> layer->dim[j]; - n = n * layer->dim[j]; - } - int device_ids_size = 0; - input >> device_ids_size; - assert(n == device_ids_size || device_ids_size == 0); - const std::vector<Processor>& all_procs = - runtime->FindAllProcessors(layer->kind); - layer->nProcs = 0; - layer->global_processors.resize(device_ids_size); - for (int j = 0; j < device_ids_size; j++) { - int device_id; - input >> device_id; - assert(device_id >= 0); - if (unsigned(device_id) >= all_procs.size()) { - const char* proc_names[] = { -#define PROC_NAMES(name, desc) desc, - REALM_PROCESSOR_KINDS(PROC_NAMES) -#undef PROC_NAMES - }; - std::cerr << "Insufficient " << proc_names[layer->kind] - << " processors for partitioning strategy " << filename - << std::endl; - abort(); - } - const Processor proc = all_procs[device_id]; - layer->global_processors[j] = proc; - // check to see if it is a local processor - if (proc.address_space() == runtime->rank_) { - assert(layer->nProcs < MAX_LOCAL_PROCS); - layer->local_processors[layer->nProcs++] = proc; - } - } - // Sanity check, compute the mapping of points in the launch domain - // to local processors so that we can easily invert them later - Domain launch_domain = layer->get_launch_domain(); - ShardingFunction* function = layer->sharding_function; - unsigned found_count = 0; - for (Domain::DomainPointIterator itr(launch_domain); itr; itr++) { - const Processor proc = function->find_proc(itr.p, launch_domain); - if (proc.address_space() != runtime->rank_) - continue; - bool found = false; - for (unsigned idx = 0; idx < layer->nProcs; idx++) { - if (layer->local_processors[idx] != proc) - continue; - layer->local_points[idx] = itr.p; - found = true; - break; - } - assert(found); - found_count++; - } - // Should have found all of them - assert(found_count == layer->nProcs); - layers[i] = layer; - } - input.close(); - return new PartitionStrategy(model, std::move(layers)); -} - -PartitionStrategy::~PartitionStrategy(void) -{ - for (auto layer : layers) delete layer; -} - -StrategyMapper::StrategyMapper( - const PartitionStrategy* s, Mapping::MapperRuntime* rt, Machine m) - : Mapper(rt), strategy(s), machine(m), local_node(get_local_node()), - total_nodes(get_total_nodes(m)), mapper_name(create_name(local_node)) -{ - // Query to find all our local processors - Machine::ProcessorQuery local_procs(machine); - local_procs.local_address_space(); - for (Machine::ProcessorQuery::iterator it =
local_procs.begin(); - it != local_procs.end(); it++) { - switch (it->kind()) { - case Processor::LOC_PROC: { - local_cpus.push_back(*it); - break; - } - case Processor::TOC_PROC: { - local_gpus.push_back(*it); - break; - } - case Processor::OMP_PROC: { - local_omps.push_back(*it); - break; - } - case Processor::IO_PROC: { - local_ios.push_back(*it); - break; - } - case Processor::PY_PROC: { - local_pys.push_back(*it); - break; - } - default: - break; - } - } - // Now do queries to find all our local memories - Machine::MemoryQuery local_sysmem(machine); - local_sysmem.local_address_space(); - local_sysmem.only_kind(Memory::SYSTEM_MEM); - assert(local_sysmem.count() > 0); - local_system_memory = local_sysmem.first(); - if (!local_gpus.empty()) { - Machine::MemoryQuery local_zcmem(machine); - local_zcmem.local_address_space(); - local_zcmem.only_kind(Memory::Z_COPY_MEM); - assert(local_zcmem.count() > 0); - local_zerocopy_memory = local_zcmem.first(); - } - for (std::vector::const_iterator it = local_gpus.begin(); - it != local_gpus.end(); it++) { - Machine::MemoryQuery local_framebuffer(machine); - local_framebuffer.local_address_space(); - local_framebuffer.only_kind(Memory::GPU_FB_MEM); - local_framebuffer.best_affinity_to(*it); - assert(local_framebuffer.count() > 0); - local_frame_buffers[*it] = local_framebuffer.first(); - } - for (std::vector::const_iterator it = local_omps.begin(); - it != local_omps.end(); it++) { - Machine::MemoryQuery local_numa(machine); - local_numa.local_address_space(); - local_numa.only_kind(Memory::SOCKET_MEM); - local_numa.best_affinity_to(*it); - if (local_numa.count() > 0) // if we have NUMA memories then use them - local_numa_domains[*it] = local_numa.first(); - else // Otherwise we just use the local system memory - local_numa_domains[*it] = local_system_memory; - } -} - -StrategyMapper::~StrategyMapper(void) -{ - free(const_cast(mapper_name)); -} - -//-------------------------------------------------------------------------- -/*static*/ AddressSpace -StrategyMapper::get_local_node(void) -//-------------------------------------------------------------------------- -{ - Processor p = Processor::get_executing_processor(); - return p.address_space(); -} - -//-------------------------------------------------------------------------- -/*static*/ size_t -StrategyMapper::get_total_nodes(Machine m) -//-------------------------------------------------------------------------- -{ - Machine::ProcessorQuery query(m); - query.only_kind(Processor::LOC_PROC); - std::set spaces; - for (Machine::ProcessorQuery::iterator it = query.begin(); it != query.end(); - it++) - spaces.insert(it->address_space()); - return spaces.size(); -} - -//-------------------------------------------------------------------------- -/*static*/ const char* -StrategyMapper::create_name(AddressSpace node) -//-------------------------------------------------------------------------- -{ - char buffer[128]; - snprintf(buffer, 127, "Legion Triton Mapper on Node %d", node); - return strdup(buffer); -} - -//-------------------------------------------------------------------------- -const char* -StrategyMapper::get_mapper_name(void) const -//-------------------------------------------------------------------------- -{ - return mapper_name; -} - -//-------------------------------------------------------------------------- -Mapper::MapperSyncModel -StrategyMapper::get_mapper_sync_model(void) const -//-------------------------------------------------------------------------- -{ - return 
SERIALIZED_REENTRANT_MAPPER_MODEL; -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::select_task_options( - const MapperContext ctx, const Task& task, TaskOptions& output) -//-------------------------------------------------------------------------- -{ - assert(task.get_depth() > 0); - if (!local_gpus.empty() && has_variant(ctx, task, Processor::TOC_PROC)) - output.initial_proc = local_gpus.front(); - else if (!local_omps.empty() && has_variant(ctx, task, Processor::OMP_PROC)) - output.initial_proc = local_omps.front(); - else - output.initial_proc = local_cpus.front(); - // We never want valid instances - output.valid_instances = false; -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::premap_task( - const MapperContext ctx, const Task& task, const PremapTaskInput& input, - PremapTaskOutput& output) -//-------------------------------------------------------------------------- -{ - // NO-op since we know that all our futures should be mapped in the system - // memory -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::slice_task( - const MapperContext ctx, const Task& task, const SliceTaskInput& input, - SliceTaskOutput& output) -//-------------------------------------------------------------------------- -{ - // For multi-node cases we should already have been sharded so we - // should just have one or a few points here on this node, so iterate - // them and round-robin them across the local processors here - output.slices.reserve(input.domain.get_volume()); - // Get the sharding functor for this operation and then use it to localize - // the points onto the processors of this shard - ShardingFunction* function = find_sharding_functor(task); - // Get the domain for the sharding space also - Domain sharding_domain = task.index_domain; - if (task.sharding_space.exists()) - sharding_domain = runtime->get_index_space_domain(ctx, task.sharding_space); - switch (function->strategy->kind) { - case Processor::LOC_PROC: { - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - const Processor proc = function->find_proc(itr.p, sharding_domain); - assert(proc.kind() == Processor::LOC_PROC); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), proc, false /*recurse*/, - false /*stealable*/)); - } - break; - } - case Processor::TOC_PROC: { - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - const Processor proc = function->find_proc(itr.p, sharding_domain); - assert(proc.kind() == Processor::TOC_PROC); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), proc, false /*recurse*/, - false /*stealable*/)); - } - break; - } - case Processor::OMP_PROC: { - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - const Processor proc = function->find_proc(itr.p, sharding_domain); - assert(proc.kind() == Processor::OMP_PROC); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), proc, false /*recurse*/, - false /*stealable*/)); - } - break; - } - default: - abort(); - } -} - -//-------------------------------------------------------------------------- -bool -StrategyMapper::has_variant( - const MapperContext ctx, const Task& task, Processor::Kind kind) -//-------------------------------------------------------------------------- -{ - const std::pair key(task.task_id, kind); - // Check to see if we already have it - std::map, VariantID>::const_iterator - finder = 
used_variants.find(key); - if ((finder != used_variants.end()) && (finder->second != 0)) - return true; - std::vector<VariantID> variants; - runtime->find_valid_variants(ctx, key.first, variants, key.second); - assert(variants.size() <= 1); - if (variants.empty()) - return false; - used_variants[key] = variants.front(); - return true; -} - -//-------------------------------------------------------------------------- -VariantID -StrategyMapper::find_variant(const MapperContext ctx, const Task& task) -//-------------------------------------------------------------------------- -{ - return find_variant(ctx, task, task.target_proc); -} - -//-------------------------------------------------------------------------- -VariantID -StrategyMapper::find_variant( - const MapperContext ctx, const Task& task, Processor target_proc) -//-------------------------------------------------------------------------- -{ - const std::pair<TaskID, Processor::Kind> key( - task.task_id, target_proc.kind()); - std::map<std::pair<TaskID, Processor::Kind>, VariantID>::const_iterator - finder = used_variants.find(key); - if ((finder != used_variants.end()) && (finder->second != 0)) - return finder->second; - // Haven't seen it before so let's look it up to make sure it exists - std::vector<VariantID> variants; - runtime->find_valid_variants(ctx, key.first, variants, key.second); - assert(variants.size() <= 1); - if (variants.empty()) - log_triton.error( - "Unable to find variant for task %s to run on processor %llx.", - task.get_task_name(), target_proc.id); - VariantID result = variants.front(); - used_variants[key] = result; - return result; -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::map_task( - const MapperContext ctx, const Task& task, const MapTaskInput& input, - MapTaskOutput& output) -//-------------------------------------------------------------------------- -{ - // Should never be mapping the top-level task here - assert(task.get_depth() > 0); - // This is one of our normal operator tasks - // First let's see if this is sub-rankable - output.chosen_instances.resize(task.regions.size()); - output.chosen_variant = find_variant(ctx, task); - // Normal task and not sub-rankable, so let's actually do the mapping - Memory target_memory = Memory::NO_MEMORY; - switch (task.target_proc.kind()) { - case Processor::LOC_PROC: { - target_memory = local_system_memory; - break; - } - case Processor::TOC_PROC: { - target_memory = local_frame_buffers[task.target_proc]; - break; - } - case Processor::OMP_PROC: { - target_memory = local_numa_domains[task.target_proc]; - break; - } - default: - abort(); - } - // Map each field separately for each of the logical regions - std::vector<PhysicalInstance> needed_acquires; - for (unsigned idx = 0; idx < task.regions.size(); idx++) { - const RegionRequirement& req = task.regions[idx]; - // Skip any regions that have been projected out - if (!req.region.exists()) - continue; - std::vector<PhysicalInstance>& instances = output.chosen_instances[idx]; - // Get the reference to our valid instances in case we decide to use them - const std::vector<PhysicalInstance>& valid = input.valid_instances[idx]; - instances.resize(req.privilege_fields.size()); - unsigned index = 0; - for (std::set<FieldID>::const_iterator it = req.privilege_fields.begin(); - it != req.privilege_fields.end(); it++, index++) - if (map_tensor( - ctx, task, idx, req.region, *it, target_memory, task.target_proc, - valid, instances[index], req.redop)) - needed_acquires.push_back(instances[index]); - } - // Do an acquire on all the instances so we have our result - // Keep doing this until we succeed or we
get an out of memory error - while (!needed_acquires.empty() && - !runtime->acquire_and_filter_instances( - ctx, needed_acquires, true /*filter on acquire*/)) { - assert(!needed_acquires.empty()); - // If we failed to acquire any of the instances we need to prune them - // out of the mapper's data structure so do that first - std::set<PhysicalInstance> failed_acquires; - filter_failed_acquires(needed_acquires, failed_acquires); - // Now go through all our region requirements and figure out which - // region requirements and fields need to attempt to remap - for (unsigned idx1 = 0; idx1 < task.regions.size(); idx1++) { - const RegionRequirement& req = task.regions[idx1]; - // Skip any regions that have been projected out - if (!req.region.exists()) - continue; - std::vector<PhysicalInstance>& instances = output.chosen_instances[idx1]; - std::set<FieldID>::const_iterator fit = req.privilege_fields.begin(); - for (unsigned idx2 = 0; idx2 < instances.size(); idx2++, fit++) { - if (failed_acquires.find(instances[idx2]) == failed_acquires.end()) - continue; - // Now try to remap it - const FieldID fid = *fit; - const std::vector<PhysicalInstance>& valid = - input.valid_instances[idx1]; - if (map_tensor( - ctx, task, idx1, req.region, fid, target_memory, - task.target_proc, valid, instances[idx2], req.redop)) - needed_acquires.push_back(instances[idx2]); - } - } - } - // Just put our target proc in the target processors for now - output.target_procs.push_back(task.target_proc); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::map_replicate_task( - const MapperContext ctx, const Task& task, const MapTaskInput& input, - const MapTaskOutput& def_output, MapReplicateTaskOutput& output) -//-------------------------------------------------------------------------- -{ - abort(); -} - -//-------------------------------------------------------------------------- -bool -StrategyMapper::find_existing_instance( - LogicalRegion region, FieldID fid, Memory target_memory, - PhysicalInstance& result) -//-------------------------------------------------------------------------- -{ - // See if we already have it in our local instances - const FieldMemInfo info(region.get_tree_id(), fid, target_memory); - std::map<FieldMemInfo, InstanceInfos>::const_iterator finder = - local_instances.find(info); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) - return true; - // See if we can find an existing instance in any memory - const FieldMemInfo info_sysmem( - region.get_tree_id(), fid, local_system_memory); - finder = local_instances.find(info_sysmem); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) { - return true; - } - for (std::map<Processor, Memory>::const_iterator it = - local_frame_buffers.begin(); - it != local_frame_buffers.end(); it++) { - const FieldMemInfo info_fb(region.get_tree_id(), fid, it->second); - finder = local_instances.find(info_fb); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) { - return true; - } - } - for (std::map<Processor, Memory>::const_iterator it = - local_numa_domains.begin(); - it != local_numa_domains.end(); it++) { - const FieldMemInfo info_numa(region.get_tree_id(), fid, it->second); - finder = local_instances.find(info_numa); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) { - return true; - } - } - return false; -} - -//-------------------------------------------------------------------------- -bool -StrategyMapper::map_tensor( - const MapperContext ctx, const Mappable& mappable,
unsigned index, - LogicalRegion region, FieldID fid, Memory target_memory, - Processor target_proc, const std::vector& valid, - PhysicalInstance& result, ReductionOpID redop /*=0*/) -//-------------------------------------------------------------------------- -{ - // If we're making a reduction instance, we should just make it now - if (redop != 0) { - // Switch the target memory if we're going to a GPU because - // Realm's DMA system still does not support reductions - if (target_memory.kind() == Memory::GPU_FB_MEM) - target_memory = local_zerocopy_memory; - const std::vector regions(1, region); - LayoutConstraintSet layout_constraints; - // No specialization - layout_constraints.add_constraint( - SpecializedConstraint(REDUCTION_FOLD_SPECIALIZE, redop)); - // SOA-C dimension ordering - std::vector dimension_ordering(4); - dimension_ordering[0] = DIM_Z; - dimension_ordering[1] = DIM_Y; - dimension_ordering[2] = DIM_X; - dimension_ordering[3] = DIM_F; - layout_constraints.add_constraint( - OrderingConstraint(dimension_ordering, false /*contiguous*/)); - // Constraint for the kind of memory - layout_constraints.add_constraint(MemoryConstraint(target_memory.kind())); - // Make sure we have our field - const std::vector fields(1, fid); - layout_constraints.add_constraint( - FieldConstraint(fields, true /*contiguous*/)); - if (!runtime->create_physical_instance( - ctx, target_memory, layout_constraints, regions, result, - true /*acquire*/)) - report_failed_mapping(mappable, index, target_memory, redop); - // We already did the acquire - return false; - } - // See if we already have it in our local instances - const FieldMemInfo info_key(region.get_tree_id(), fid, target_memory); - std::map::const_iterator finder = - local_instances.find(info_key); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) { - // Needs acquire to keep the runtime happy - return true; - } - // There's a little asymmetry here between CPUs and GPUs for NUMA effects - // For CPUs NUMA-effects are within a factor of 2X additional latency and - // reduced bandwidth, so it's better to just use data where it is rather - // than move it. For GPUs though, the difference between local framebuffer - // and remote can be on the order of 800 GB/s versus 20 GB/s over NVLink - // so it's better to move things local, so we'll always try to make a local - // instance before checking for a nearby instance in a different GPU. - if (target_proc.exists() && ((target_proc.kind() == Processor::LOC_PROC) || - (target_proc.kind() == Processor::OMP_PROC))) { - Machine::MemoryQuery affinity_mems(machine); - affinity_mems.has_affinity_to(target_proc); - for (Machine::MemoryQuery::iterator it = affinity_mems.begin(); - it != affinity_mems.end(); it++) { - const FieldMemInfo affinity_info(region.get_tree_id(), fid, *it); - finder = local_instances.find(affinity_info); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) - // Needs acquire to keep the runtime happy - return true; - } - } - // Haven't made this instance before, so make it now - // We can do an interesting optimization here to try to reduce unnecessary - // inter-memory copies. For logical regions that are overlapping we try - // to accumulate as many as possible into one physical instance and use - // that instance for all the tasks for the different regions. 
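// [Editor's note] The merge heuristic implemented below accepts a merge only
// when the union bounding box wastes no more volume than the intersection
// recovers. A minimal worked example of that test, assuming Legion's Rect<2>
// and Point<2> with inclusive bounds:
#if 0  // illustrative sketch only, not part of the original source
Rect<2> a(Point<2>(0, 0), Point<2>(9, 9));    // volume 100
Rect<2> b(Point<2>(5, 0), Point<2>(14, 9));   // volume 100
Rect<2> inter = a.intersection(b);            // x in [5,9]  -> volume 50
Rect<2> uni = a.union_bbox(b);                // x in [0,14] -> volume 150
size_t wasted = uni.volume() - (a.volume() + b.volume() - inter.volume());
// wasted = 150 - 150 = 0, which is <= inter.volume() = 50, so merging a and b
// into one physical instance with bounds uni would be accepted.
#endif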
- // First we have to see if there is anything we overlap with - const IndexSpace is = region.get_index_space(); - // This whole process has to appear atomic - runtime->disable_reentrant(ctx); - InstanceInfos& infos = local_instances[info_key]; - // One more check once we get the lock - if (infos.has_instance(region, result)) { - runtime->enable_reentrant(ctx); - return true; - } - const Domain dom = runtime->get_index_space_domain(ctx, is); - std::vector overlaps; - // Regions to include in the overlap from other fields - std::set other_field_overlaps; - // This is guaranteed to be a rectangle - Domain upper_bound; - switch (is.get_dim()) { -#define DIMFUNC(DN) \ - case DN: { \ - bool changed = false; \ - Rect bound = dom.bounds(); \ - for (unsigned idx = 0; idx < infos.instances.size(); idx++) { \ - const InstanceInfo& info = infos.instances[idx]; \ - Rect other = info.bounding_box; \ - Rect intersect = bound.intersection(other); \ - if (intersect.empty()) \ - continue; \ - /*Don't merge if the unused space would be more than the space saved*/ \ - Rect union_bbox = bound.union_bbox(other); \ - size_t bound_volume = bound.volume(); \ - size_t union_volume = union_bbox.volume(); \ - /* If it didn't get any bigger then we can keep going*/ \ - if (bound_volume == union_volume) \ - continue; \ - size_t intersect_volume = intersect.volume(); \ - /* Only allow merging if it isn't "too big"*/ \ - /* We define "too big" as the size of the "unused" points being bigger \ - * than the intersection*/ \ - if ((union_volume - (bound_volume + other.volume() - \ - intersect_volume)) > intersect_volume) \ - continue; \ - overlaps.push_back(idx); \ - bound = union_bbox; \ - changed = true; \ - } \ - /* If we didn't find any overlapping modifications check adjacent fields \ - * in the same tree*/ \ - /* to see if we can use them to infer what our shape should be.*/ \ - if (!changed) { \ - for (std::map::const_iterator it = \ - local_instances.begin(); \ - it != local_instances.end(); it++) { \ - if ((it->first.tid != info_key.tid) || \ - (it->first.fid == info_key.fid) || \ - (it->first.memory != info_key.memory)) \ - continue; \ - std::map::const_iterator finder = \ - it->second.region_mapping.find(region); \ - if (finder != it->second.region_mapping.end()) { \ - const InstanceInfo& other_info = \ - it->second.instances[finder->second]; \ - Rect other = other_info.bounding_box; \ - bound = bound.union_bbox(other); \ - other_field_overlaps.insert( \ - other_info.regions.begin(), other_info.regions.end()); \ - } \ - } \ - } \ - upper_bound = Domain(bound); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } - // We're going to need some of this constraint information no matter - // which path we end up taking below - LayoutConstraintSet layout_constraints; - // No specialization - layout_constraints.add_constraint(SpecializedConstraint()); - // SOA-C dimension ordering - std::vector dimension_ordering(4); - dimension_ordering[0] = DIM_Z; - dimension_ordering[1] = DIM_Y; - dimension_ordering[2] = DIM_X; - dimension_ordering[3] = DIM_F; - layout_constraints.add_constraint( - OrderingConstraint(dimension_ordering, false /*contiguous*/)); - // Constraint for the kind of memory - layout_constraints.add_constraint(MemoryConstraint(target_memory.kind())); - // Make sure we have our field - const std::vector fields(1, fid); - layout_constraints.add_constraint( - FieldConstraint(fields, true /*contiguous*/)); - // Check to see if we have any overlaps - if (overlaps.empty()) { 
- // No overlaps, so just go ahead and make our instance and add it - std::vector regions(1, region); - // If we're bringing in other regions include them as well in this set - if (!other_field_overlaps.empty()) { - other_field_overlaps.erase(region); - regions.insert( - regions.end(), other_field_overlaps.begin(), - other_field_overlaps.end()); - } - bool created; - size_t footprint; - if (runtime->find_or_create_physical_instance( - ctx, target_memory, layout_constraints, regions, result, created, - true /*acquire*/, GC_NEVER_PRIORITY, false /*tight bounds*/, - &footprint)) { - // We succeeded in making the instance where we want it - assert(result.exists()); - if (created) - log_triton.info( - "%s created instance %lx containing %zd bytes in memory " IDFMT, - mapper_name, result.get_instance_id(), footprint, target_memory.id); - // Only save the result for future use if it is not an external instance - if (!result.is_external_instance()) { - const unsigned idx = infos.insert(region, upper_bound, result); - InstanceInfo& info = infos.instances[idx]; - for (std::set::const_iterator it = - other_field_overlaps.begin(); - it != other_field_overlaps.end(); it++) { - if ((*it) == region) - continue; - infos.region_mapping[*it] = idx; - info.regions.push_back(*it); - } - } - // We made it so no need for an acquire - runtime->enable_reentrant(ctx); - return false; - } - - } else if (overlaps.size() == 1) { - // Overlap with exactly one other instance - InstanceInfo& info = infos.instances[overlaps[0]]; - // A Legion bug prevents us from doing this case - if (info.bounding_box == upper_bound) { - // Easy case of dominance, so just add it - info.regions.push_back(region); - infos.region_mapping[region] = overlaps[0]; - result = info.instance; - runtime->enable_reentrant(ctx); - // Didn't make it so we need to acquire it - return true; - } else { - // We have to make a new instance - info.regions.push_back(region); - bool created; - size_t footprint; - if (runtime->find_or_create_physical_instance( - ctx, target_memory, layout_constraints, info.regions, result, - created, true /*acquire*/, GC_NEVER_PRIORITY, - false /*tight bounds*/, &footprint)) { - // We succeeded in making the instance where we want it - assert(result.exists()); - if (created) - log_triton.info( - "%s created instance %lx containing %zd bytes in memory " IDFMT, - mapper_name, result.get_instance_id(), footprint, - target_memory.id); - // Remove the GC priority on the old instance back to 0 - runtime->set_garbage_collection_priority(ctx, info.instance, 0); - // Update everything in place - info.instance = result; - info.bounding_box = upper_bound; - infos.region_mapping[region] = overlaps[0]; - runtime->enable_reentrant(ctx); - // We made it so no need for an acquire - return false; - } else // Failed to make it so pop the logical region name back off - info.regions.pop_back(); - } - } else { - // Overlap with multiple previous instances - std::vector combined_regions(1, region); - for (std::vector::const_iterator it = overlaps.begin(); - it != overlaps.end(); it++) - combined_regions.insert( - combined_regions.end(), infos.instances[*it].regions.begin(), - infos.instances[*it].regions.end()); - // Try to make it - bool created; - size_t footprint; - if (runtime->find_or_create_physical_instance( - ctx, target_memory, layout_constraints, combined_regions, result, - created, true /*acquire*/, GC_NEVER_PRIORITY, - false /*tight bounds*/, &footprint)) { - // We succeeded in making the instance where we want it - 
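// [Editor's note] A note on the acquire protocol in this function: the
// create/find calls above and below pass true /*acquire*/, so on success the
// mapper already holds a reference and these branches return false ("no
// acquire needed"); branches that hand back a cached instance return true
// instead, so map_task adds the instance to needed_acquires and acquires it
// explicitly in its retry loop.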
assert(result.exists()); - if (created) - log_triton.info( - "%s created instance %lx containing %zd bytes in memory " IDFMT, - mapper_name, result.get_instance_id(), footprint, target_memory.id); - // Remove all the previous entries back to front - for (std::vector::const_reverse_iterator it = - overlaps.crbegin(); - it != overlaps.crend(); it++) { - // Remove the GC priority on the old instance - runtime->set_garbage_collection_priority( - ctx, infos.instances[*it].instance, 0); - infos.instances.erase(infos.instances.begin() + *it); - } - // Add the new entry - const unsigned index = infos.instances.size(); - infos.instances.resize(index + 1); - InstanceInfo& info = infos.instances[index]; - info.instance = result; - info.bounding_box = upper_bound; - info.regions = combined_regions; - // Update the mappings for all the instances - // This really sucks but it should be pretty rare - // We can start at the entry of the first overlap since everything - // before that is guaranteed to be unchanged - for (unsigned idx = overlaps[0]; idx < infos.instances.size(); idx++) { - for (std::vector::const_iterator it = - infos.instances[idx].regions.begin(); - it != infos.instances[idx].regions.end(); it++) - infos.region_mapping[*it] = idx; - } - runtime->enable_reentrant(ctx); - // We made it so no need for an acquire - return false; - } - } - // Done with the atomic part - runtime->enable_reentrant(ctx); - // If we get here it's because we failed to make the instance, we still - // have a few more tricks that we can try - // First see if we can find an existing valid instance that we can use - // with affinity to our target processor - if (!valid.empty()) { - for (std::vector::const_iterator it = valid.begin(); - it != valid.end(); it++) { - // If it doesn't have the field then we don't care - if (!it->has_field(fid)) - continue; - if (!target_proc.exists() || - machine.has_affinity(target_proc, it->get_location())) { - result = *it; - return true; - } - } - } - // Still couldn't find an instance, see if we can find any instances - // in memories that are local to our node that we can use - if (target_proc.exists()) { - Machine::MemoryQuery affinity_mems(machine); - affinity_mems.has_affinity_to(target_proc); - for (Machine::MemoryQuery::iterator it = affinity_mems.begin(); - it != affinity_mems.end(); it++) { - const FieldMemInfo affinity_info(region.get_tree_id(), fid, *it); - finder = local_instances.find(affinity_info); - if ((finder != local_instances.end()) && - finder->second.has_instance(region, result)) - // Needs acquire to keep the runtime happy - return true; - } - } else if (find_existing_instance(region, fid, target_memory, result)) { - return true; - } - // If we make it here then we failed entirely - report_failed_mapping(mappable, index, target_memory, redop); - return true; -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::filter_failed_acquires( - std::vector& needed_acquires, - std::set& failed_acquires) -//-------------------------------------------------------------------------- -{ - for (std::vector::const_iterator it = - needed_acquires.begin(); - it != needed_acquires.end(); it++) { - if (failed_acquires.find(*it) != failed_acquires.end()) - continue; - failed_acquires.insert(*it); - const Memory mem = it->get_location(); - const RegionTreeID tid = it->get_tree_id(); - for (std::map::iterator fit = - local_instances.begin(); - fit != local_instances.end(); - /*nothing*/) { - if ((fit->first.memory != mem) || 
-//--------------------------------------------------------------------------
-void
-StrategyMapper::filter_failed_acquires(
-    std::vector<PhysicalInstance>& needed_acquires,
-    std::set<PhysicalInstance>& failed_acquires)
-//--------------------------------------------------------------------------
-{
-  for (std::vector<PhysicalInstance>::const_iterator it =
-           needed_acquires.begin();
-       it != needed_acquires.end(); it++) {
-    if (failed_acquires.find(*it) != failed_acquires.end())
-      continue;
-    failed_acquires.insert(*it);
-    const Memory mem = it->get_location();
-    const RegionTreeID tid = it->get_tree_id();
-    for (std::map<FieldMemInfo, InstanceInfos>::iterator fit =
-             local_instances.begin();
-         fit != local_instances.end();
-         /*nothing*/) {
-      if ((fit->first.memory != mem) || (fit->first.tid != tid)) {
-        fit++;
-        continue;
-      }
-      if (fit->second.filter(*it)) {
-        std::map<FieldMemInfo, InstanceInfos>::iterator to_delete = fit++;
-        local_instances.erase(to_delete);
-      } else
-        fit++;
-    }
-  }
-  needed_acquires.clear();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::report_failed_mapping(
-    const Mappable& mappable, unsigned index, Memory target_memory,
-    ReductionOpID redop)
-//--------------------------------------------------------------------------
-{
-  const char* memory_kinds[] = {
-#define MEM_NAMES(name, desc) desc,
-      REALM_MEMORY_KINDS(MEM_NAMES)
-#undef MEM_NAMES
-  };
-  switch (mappable.get_mappable_type()) {
-    case Mappable::TASK_MAPPABLE: {
-      const Task* task = mappable.as_task();
-      if (redop > 0)
-        log_triton.error(
-            "Mapper %s failed to map reduction (%d) region "
-            "requirement %d of task %s (UID %lld) into %s memory " IDFMT,
-            get_mapper_name(), redop, index, task->get_task_name(),
-            mappable.get_unique_id(), memory_kinds[target_memory.kind()],
-            target_memory.id);
-      else
-        log_triton.error(
-            "Mapper %s failed to map region requirement %d of "
-            "task %s (UID %lld) into %s memory " IDFMT,
-            get_mapper_name(), index, task->get_task_name(),
-            mappable.get_unique_id(), memory_kinds[target_memory.kind()],
-            target_memory.id);
-      break;
-    }
-    case Mappable::COPY_MAPPABLE: {
-      if (redop > 0)
-        log_triton.error(
-            "Mapper %s failed to map reduction (%d) region "
-            "requirement %d of copy (UID %lld) into %s memory " IDFMT,
-            get_mapper_name(), redop, index, mappable.get_unique_id(),
-            memory_kinds[target_memory.kind()], target_memory.id);
-      else
-        log_triton.error(
-            "Mapper %s failed to map region requirement %d of "
-            "copy (UID %lld) into %s memory " IDFMT,
-            get_mapper_name(), index, mappable.get_unique_id(),
-            memory_kinds[target_memory.kind()], target_memory.id);
-      break;
-    }
-    case Mappable::INLINE_MAPPABLE: {
-      if (redop > 0)
-        log_triton.error(
-            "Mapper %s failed to map reduction (%d) region "
-            "requirement %d of inline mapping (UID %lld) into %s memory " IDFMT,
-            get_mapper_name(), redop, index, mappable.get_unique_id(),
-            memory_kinds[target_memory.kind()], target_memory.id);
-      else
-        log_triton.error(
-            "Mapper %s failed to map region requirement %d of "
-            "inline mapping (UID %lld) into %s memory " IDFMT,
-            get_mapper_name(), index, mappable.get_unique_id(),
-            memory_kinds[target_memory.kind()], target_memory.id);
-      break;
-    }
-    case Mappable::PARTITION_MAPPABLE: {
-      assert(redop == 0);
-      log_triton.error(
-          "Mapper %s failed to map region requirement %d of "
-          "partition (UID %lld) into %s memory " IDFMT,
-          get_mapper_name(), index, mappable.get_unique_id(),
-          memory_kinds[target_memory.kind()], target_memory.id);
-      break;
-    }
-    default:
-      abort();  // should never get here
-  }
-  abort();
-}
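filter_failed_acquires above evicts every cached entry whose (tree, field, memory) key matches a failed instance, using the erase-while-iterating idiom. The same pattern in isolation, with plain stand-in types (hypothetical names, not the mapper's real structures):

#include <map>
#include <set>
#include <tuple>

using Key = std::tuple<int /*tree*/, int /*field*/, int /*memory*/>;
using InstanceId = unsigned long;

void evict(std::map<Key, std::set<InstanceId>>& cache, InstanceId failed) {
  for (auto it = cache.begin(); it != cache.end(); /*nothing*/) {
    it->second.erase(failed);
    if (it->second.empty())
      it = cache.erase(it);  // C++11: map::erase returns the next iterator
    else
      ++it;
  }
}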
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_task_variant(
-    const MapperContext ctx, const Task& task, const SelectVariantInput& input,
-    SelectVariantOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.chosen_variant = find_variant(ctx, task, input.processor);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::postmap_task(
-    const MapperContext ctx, const Task& task, const PostMapInput& input,
-    PostMapOutput& output)
-//--------------------------------------------------------------------------
-{
-  // We should currently never get this call in triton
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_task_sources(
-    const MapperContext ctx, const Task& task, const SelectTaskSrcInput& input,
-    SelectTaskSrcOutput& output)
-//--------------------------------------------------------------------------
-{
-  triton_select_sources(
-      ctx, input.target, input.source_instances, output.chosen_ranking);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::triton_select_sources(
-    const MapperContext ctx, const PhysicalInstance& target,
-    const std::vector<PhysicalInstance>& sources,
-    std::deque<PhysicalInstance>& ranking)
-//--------------------------------------------------------------------------
-{
-  std::map<Memory, unsigned> source_memories;
-  // For right now we'll rank instances by the bandwidth of the memory
-  // they are in to the destination; we'll only rank sources from the
-  // local node if there are any
-  bool all_local = false;
-  // TODO: consider layouts when ranking source to help out the DMA system
-  Memory destination_memory = target.get_location();
-  std::vector<Machine::MemoryMemoryAffinity> affinity(1);
-  // fill in a vector of the sources with their bandwidths and sort them
-  std::vector<std::pair<PhysicalInstance, unsigned>> band_ranking;
-  for (unsigned idx = 0; idx < sources.size(); idx++) {
-    const PhysicalInstance& instance = sources[idx];
-    Memory location = instance.get_location();
-    if (location.address_space() == local_node) {
-      if (!all_local) {
-        source_memories.clear();
-        band_ranking.clear();
-        all_local = true;
-      }
-    } else if (all_local)  // Skip any remote instances once we're local
-      continue;
-    std::map<Memory, unsigned>::const_iterator finder =
-        source_memories.find(location);
-    if (finder == source_memories.end()) {
-      affinity.clear();
-      machine.get_mem_mem_affinity(
-          affinity, location, destination_memory,
-          false /*not just local affinities*/);
-      unsigned memory_bandwidth = 0;
-      if (!affinity.empty()) {
-        assert(affinity.size() == 1);
-        memory_bandwidth = affinity[0].bandwidth;
-      }
-      source_memories[location] = memory_bandwidth;
-      band_ranking.push_back(
-          std::pair<PhysicalInstance, unsigned>(instance, memory_bandwidth));
-    } else
-      band_ranking.push_back(
-          std::pair<PhysicalInstance, unsigned>(instance, finder->second));
-  }
-  assert(!band_ranking.empty());
-  // Easy case of only one instance
-  if (band_ranking.size() == 1) {
-    ranking.push_back(band_ranking.begin()->first);
-    return;
-  }
-  // Sort them by bandwidth
-  std::sort(band_ranking.begin(), band_ranking.end(), physical_sort_func);
-  // Iterate from largest bandwidth to smallest
-  for (std::vector<
-           std::pair<PhysicalInstance, unsigned>>::const_reverse_iterator it =
-           band_ranking.rbegin();
-       it != band_ranking.rend(); it++)
-    ranking.push_back(it->first);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::speculate(
-    const MapperContext ctx, const Task& task, SpeculativeOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.speculate = false;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::report_profiling(
-    const MapperContext ctx, const Task& task, const TaskProfilingInfo& input)
-//--------------------------------------------------------------------------
-{
-  // Shouldn't get any profiling feedback currently
-  abort();
-}
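The ranking policy in triton_select_sources boils down to: score each source by the bandwidth of its memory to the destination, sort ascending, then emit from highest to lowest. A standalone sketch of just that step (stand-in Instance type, not Legion's API):

#include <algorithm>
#include <deque>
#include <utility>
#include <vector>

using Instance = int;  // stand-in for Legion::Mapping::PhysicalInstance

std::deque<Instance> rank_by_bandwidth(
    std::vector<std::pair<Instance, unsigned /*bandwidth*/>> scored) {
  std::sort(scored.begin(), scored.end(),
            [](const auto& l, const auto& r) { return l.second < r.second; });
  std::deque<Instance> ranking;
  for (auto it = scored.rbegin(); it != scored.rend(); ++it)
    ranking.push_back(it->first);  // best (highest bandwidth) first
  return ranking;
}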
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_sharding_functor(
-    const MapperContext ctx, const Task& task,
-    const SelectShardingFunctorInput& input,
-    SelectShardingFunctorOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.chosen_functor = find_sharding_functor(task)->sharding_id;
-}
-
-//--------------------------------------------------------------------------
-ShardingFunction*
-StrategyMapper::find_sharding_functor(const Mappable& mappable)
-//--------------------------------------------------------------------------
-{
-  assert(mappable.tag < strategy->layers.size());
-  return strategy->layers[mappable.tag]->sharding_function;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::map_inline(
-    const MapperContext ctx, const InlineMapping& inline_op,
-    const MapInlineInput& input, MapInlineOutput& output)
-//--------------------------------------------------------------------------
-{
-  const std::vector<PhysicalInstance>& valid = input.valid_instances;
-  const RegionRequirement& req = inline_op.requirement;
-  output.chosen_instances.resize(req.privilege_fields.size());
-  unsigned index = 0;
-  std::vector<PhysicalInstance> needed_acquires;
-  for (std::set<FieldID>::const_iterator it = req.privilege_fields.begin();
-       it != req.privilege_fields.end(); it++, index++)
-    if (map_tensor(
-            ctx, inline_op, 0, req.region, *it, local_system_memory,
-            inline_op.parent_task->current_proc, valid,
-            output.chosen_instances[index], req.redop))
-      needed_acquires.push_back(output.chosen_instances[index]);
-  while (!needed_acquires.empty() &&
-         !runtime->acquire_and_filter_instances(
-             ctx, needed_acquires, true /*filter on acquire*/)) {
-    assert(!needed_acquires.empty());
-    std::set<PhysicalInstance> failed_instances;
-    filter_failed_acquires(needed_acquires, failed_instances);
-    // Now go through all the fields for the instances and try to remap
-    std::set<FieldID>::const_iterator fit = req.privilege_fields.begin();
-    for (unsigned idx = 0; idx < output.chosen_instances.size(); idx++, fit++) {
-      if (failed_instances.find(output.chosen_instances[idx]) ==
-          failed_instances.end())
-        continue;
-      // Now try to remap it
-      if (map_tensor(
-              ctx, inline_op, 0 /*idx*/, req.region, *fit, local_system_memory,
-              inline_op.parent_task->current_proc, valid,
-              output.chosen_instances[idx]))
-        needed_acquires.push_back(output.chosen_instances[idx]);
-    }
-  }
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_inline_sources(
-    const MapperContext ctx, const InlineMapping& inline_op,
-    const SelectInlineSrcInput& input, SelectInlineSrcOutput& output)
-//--------------------------------------------------------------------------
-{
-  triton_select_sources(
-      ctx, input.target, input.source_instances, output.chosen_ranking);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::report_profiling(
-    const MapperContext ctx, const InlineMapping& inline_op,
-    const InlineProfilingInfo& input)
-//--------------------------------------------------------------------------
-{
-  // No profiling yet for inline mappings
-  abort();
-}
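map_inline above follows the same retry shape reused by map_copy and map_partition below: map every field, try to acquire all chosen instances at once, and remap only the fields whose instances failed. A schematic version of that loop with hypothetical map/acquire callbacks (simplified: the real mapper computes the exact failed subset via filter_failed_acquires rather than treating the whole batch as failed):

#include <functional>
#include <set>
#include <vector>

using Instance = int;  // stand-in for a physical instance handle

void map_with_retries(
    std::vector<Instance>& chosen,
    const std::function<Instance(size_t)>& map_field,
    const std::function<bool(std::vector<Instance>&)>& acquire_all) {
  std::vector<Instance> needed;
  for (size_t i = 0; i < chosen.size(); ++i) {
    chosen[i] = map_field(i);
    needed.push_back(chosen[i]);
  }
  while (!needed.empty() && !acquire_all(needed)) {
    std::set<Instance> failed(needed.begin(), needed.end());
    needed.clear();
    for (size_t i = 0; i < chosen.size(); ++i)
      if (failed.count(chosen[i])) {  // only remap what was lost
        chosen[i] = map_field(i);
        needed.push_back(chosen[i]);
      }
  }
}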
-//--------------------------------------------------------------------------
-void
-StrategyMapper::map_copy(
-    const MapperContext ctx, const Copy& copy, const MapCopyInput& input,
-    MapCopyOutput& output)
-//--------------------------------------------------------------------------
-{
-  // We should always be able to materialize instances of the things
-  // we are copying so make concrete source instances
-  std::vector<PhysicalInstance> needed_acquires;
-  Memory target_memory = local_system_memory;
-  if (copy.is_index_space) {
-    // If we've got GPUs, assume we're using them
-    if (!local_gpus.empty() || !local_omps.empty()) {
-      ShardingFunction* function = find_sharding_functor(copy);
-      const Processor proc =
-          function->find_proc(copy.index_point, copy.index_domain);
-      assert(
-          (proc.kind() == Processor::OMP_PROC) ||
-          (proc.kind() == Processor::TOC_PROC));
-      if (proc.kind() == Processor::OMP_PROC)
-        target_memory = local_numa_domains[proc];
-      else
-        target_memory = local_frame_buffers[proc];
-    }
-  } else {
-    // If we have just one local GPU then let's use it, otherwise punt to CPU
-    // since it's not clear which one we should use
-    if (local_frame_buffers.size() == 1)
-      target_memory = local_frame_buffers.begin()->second;
-  }
-  for (unsigned idx = 0; idx < copy.src_requirements.size(); idx++) {
-    const RegionRequirement& src_req = copy.src_requirements[idx];
-    output.src_instances[idx].resize(src_req.privilege_fields.size());
-    const std::vector<PhysicalInstance>& src_valid = input.src_instances[idx];
-    unsigned fidx = 0;
-    for (std::set<FieldID>::const_iterator it =
-             src_req.privilege_fields.begin();
-         it != src_req.privilege_fields.end(); it++, fidx++) {
-      if (find_existing_instance(
-              src_req.region, *it, target_memory,
-              output.src_instances[idx][fidx]) ||
-          map_tensor(
-              ctx, copy, idx, src_req.region, *it, target_memory,
-              Processor::NO_PROC, src_valid, output.src_instances[idx][fidx]))
-        needed_acquires.push_back(output.src_instances[idx][fidx]);
-    }
-    const RegionRequirement& dst_req = copy.dst_requirements[idx];
-    output.dst_instances[idx].resize(dst_req.privilege_fields.size());
-    const std::vector<PhysicalInstance>& dst_valid = input.dst_instances[idx];
-    fidx = 0;
-    for (std::set<FieldID>::const_iterator it =
-             dst_req.privilege_fields.begin();
-         it != dst_req.privilege_fields.end(); it++, fidx++) {
-      if (((dst_req.redop == 0) && find_existing_instance(
-                                       dst_req.region, *it, target_memory,
-                                       output.dst_instances[idx][fidx])) ||
-          map_tensor(
-              ctx, copy, copy.src_requirements.size() + idx, dst_req.region,
-              *it, target_memory, Processor::NO_PROC, dst_valid,
-              output.dst_instances[idx][fidx], dst_req.redop))
-        needed_acquires.push_back(output.dst_instances[idx][fidx]);
-    }
-    if (idx < copy.src_indirect_requirements.size()) {
-      const RegionRequirement& src_idx = copy.src_indirect_requirements[idx];
-      assert(src_idx.privilege_fields.size() == 1);
-      const FieldID fid = *(src_idx.privilege_fields.begin());
-      const std::vector<PhysicalInstance>& idx_valid =
-          input.src_indirect_instances[idx];
-      if (find_existing_instance(
-              src_idx.region, fid, target_memory,
-              output.src_indirect_instances[idx]) ||
-          map_tensor(
-              ctx, copy, idx, src_idx.region, fid, target_memory,
-              Processor::NO_PROC, idx_valid,
-              output.src_indirect_instances[idx]))
-        needed_acquires.push_back(output.src_indirect_instances[idx]);
-    }
-    if (idx < copy.dst_indirect_requirements.size()) {
-      const RegionRequirement& dst_idx = copy.dst_indirect_requirements[idx];
-      assert(dst_idx.privilege_fields.size() == 1);
-      const FieldID fid = *(dst_idx.privilege_fields.begin());
-      const std::vector<PhysicalInstance>& idx_valid =
-          input.dst_indirect_instances[idx];
-      if (find_existing_instance(
-              dst_idx.region, fid, target_memory,
-              output.dst_indirect_instances[idx]) ||
-          map_tensor(
-              ctx, copy, idx, dst_idx.region, fid, target_memory,
-              Processor::NO_PROC, idx_valid,
-              output.dst_indirect_instances[idx]))
-        needed_acquires.push_back(output.dst_indirect_instances[idx]);
-    }
-  }
-  while (!needed_acquires.empty() &&
-         !runtime->acquire_and_filter_instances(
-             ctx, needed_acquires, true /*filter on acquire*/)) {
-    assert(!needed_acquires.empty());
-    // If we failed to acquire any of the instances we need to prune them
-    // out of the mapper's data structure so do that first
-    std::set<PhysicalInstance> failed_acquires;
-    filter_failed_acquires(needed_acquires, failed_acquires);
-    // Now go through and try to remap region requirements with failed
-    // acquisitions
-    for (unsigned idx = 0; idx < copy.src_requirements.size(); idx++) {
-      const RegionRequirement& src_req = copy.src_requirements[idx];
-      const std::vector<PhysicalInstance>& src_valid = input.src_instances[idx];
-      unsigned fidx = 0;
-      for (std::set<FieldID>::const_iterator it =
-               src_req.privilege_fields.begin();
-           it != src_req.privilege_fields.end(); it++, fidx++) {
-        if (failed_acquires.find(output.src_instances[idx][fidx]) ==
-            failed_acquires.end())
-          continue;
-        if (map_tensor(
-                ctx, copy, idx, src_req.region, *it, target_memory,
-                Processor::NO_PROC, src_valid, output.src_instances[idx][fidx]))
-          needed_acquires.push_back(output.src_instances[idx][fidx]);
-      }
-      const RegionRequirement& dst_req = copy.dst_requirements[idx];
-      output.dst_instances[idx].resize(dst_req.privilege_fields.size());
-      const std::vector<PhysicalInstance>& dst_valid = input.dst_instances[idx];
-      fidx = 0;
-      for (std::set<FieldID>::const_iterator it =
-               dst_req.privilege_fields.begin();
-           it != dst_req.privilege_fields.end(); it++, fidx++) {
-        if (failed_acquires.find(output.dst_instances[idx][fidx]) ==
-            failed_acquires.end())
-          continue;
-        if (map_tensor(
-                ctx, copy, copy.src_requirements.size() + idx, dst_req.region,
-                *it, target_memory, Processor::NO_PROC, dst_valid,
-                output.dst_instances[idx][fidx], dst_req.redop))
-          needed_acquires.push_back(output.dst_instances[idx][fidx]);
-      }
-      if (idx < copy.src_indirect_requirements.size()) {
-        const RegionRequirement& src_idx = copy.src_indirect_requirements[idx];
-        assert(src_idx.privilege_fields.size() == 1);
-        const FieldID fid = *(src_idx.privilege_fields.begin());
-        const std::vector<PhysicalInstance>& idx_valid =
-            input.src_indirect_instances[idx];
-        if ((failed_acquires.find(output.src_indirect_instances[idx]) !=
-             failed_acquires.end()) &&
-            map_tensor(
-                ctx, copy, idx, src_idx.region, fid, target_memory,
-                Processor::NO_PROC, idx_valid,
-                output.src_indirect_instances[idx]))
-          needed_acquires.push_back(output.src_indirect_instances[idx]);
-      }
-      if (idx < copy.dst_indirect_requirements.size()) {
-        const RegionRequirement& dst_idx = copy.dst_indirect_requirements[idx];
-        assert(dst_idx.privilege_fields.size() == 1);
-        const FieldID fid = *(dst_idx.privilege_fields.begin());
-        const std::vector<PhysicalInstance>& idx_valid =
-            input.dst_indirect_instances[idx];
-        if ((failed_acquires.find(output.dst_indirect_instances[idx]) !=
-             failed_acquires.end()) &&
-            map_tensor(
-                ctx, copy, idx, dst_idx.region, fid, target_memory,
-                Processor::NO_PROC, idx_valid,
-                output.dst_indirect_instances[idx]))
-          needed_acquires.push_back(output.dst_indirect_instances[idx]);
-      }
-    }
-  }
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_copy_sources(
-    const MapperContext ctx, const Copy& copy, const SelectCopySrcInput& input,
-    SelectCopySrcOutput& output)
-//--------------------------------------------------------------------------
-{
-  triton_select_sources(
-      ctx, input.target, input.source_instances, output.chosen_ranking);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::speculate(
-    const MapperContext ctx, const Copy& copy, SpeculativeOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.speculate = false;
-}
-
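The target-memory choice at the top of map_copy, in isolation: index-space copies follow the sharded processor (framebuffer for GPUs, NUMA domain for OpenMP), while single copies fall back to system memory unless exactly one local GPU makes the framebuffer the unambiguous pick. A sketch with stand-in enums (hypothetical names, not Legion's types):

enum class ProcKind { CPU, GPU, OMP };
enum class MemKind { SYSTEM, FRAMEBUFFER, NUMA };

MemKind choose_target_memory(bool index_space_copy, ProcKind sharded_proc,
                             unsigned local_gpu_count) {
  if (index_space_copy) {
    switch (sharded_proc) {
      case ProcKind::GPU: return MemKind::FRAMEBUFFER;
      case ProcKind::OMP: return MemKind::NUMA;
      default:            return MemKind::SYSTEM;
    }
  }
  // Single copy: only commit to a framebuffer if the choice is unambiguous.
  return (local_gpu_count == 1) ? MemKind::FRAMEBUFFER : MemKind::SYSTEM;
}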
-//-------------------------------------------------------------------------- -void -StrategyMapper::report_profiling( - const MapperContext ctx, const Copy& copy, const CopyProfilingInfo& input) -//-------------------------------------------------------------------------- -{ - // No profiling for copies yet - abort(); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::select_sharding_functor( - const MapperContext ctx, const Copy& copy, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output) -//-------------------------------------------------------------------------- -{ - output.chosen_functor = find_sharding_functor(copy)->sharding_id; -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::map_close( - const MapperContext ctx, const Close& close, const MapCloseInput& input, - MapCloseOutput& output) -//-------------------------------------------------------------------------- -{ - // Map everything with composite instances for now - output.chosen_instances.push_back(PhysicalInstance::get_virtual_instance()); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::select_close_sources( - const MapperContext ctx, const Close& close, - const SelectCloseSrcInput& input, SelectCloseSrcOutput& output) -//-------------------------------------------------------------------------- -{ - triton_select_sources( - ctx, input.target, input.source_instances, output.chosen_ranking); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::report_profiling( - const MapperContext ctx, const Close& close, - const CloseProfilingInfo& input) -//-------------------------------------------------------------------------- -{ - // No profiling yet for triton - abort(); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::select_sharding_functor( - const MapperContext ctx, const Close& close, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output) -//-------------------------------------------------------------------------- -{ - abort(); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::map_acquire( - const MapperContext ctx, const Acquire& acquire, - const MapAcquireInput& input, MapAcquireOutput& output) -//-------------------------------------------------------------------------- -{ - // Nothing to do -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::speculate( - const MapperContext ctx, const Acquire& acquire, SpeculativeOutput& output) -//-------------------------------------------------------------------------- -{ - output.speculate = false; -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::report_profiling( - const MapperContext ctx, const Acquire& acquire, - const AcquireProfilingInfo& input) -//-------------------------------------------------------------------------- -{ - // No profiling for triton yet - abort(); -} - -//-------------------------------------------------------------------------- -void -StrategyMapper::select_sharding_functor( - const MapperContext ctx, const Acquire& acquire, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output) 
-//--------------------------------------------------------------------------
-{
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::map_release(
-    const MapperContext ctx, const Release& release,
-    const MapReleaseInput& input, MapReleaseOutput& output)
-//--------------------------------------------------------------------------
-{
-  // Nothing to do
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_release_sources(
-    const MapperContext ctx, const Release& release,
-    const SelectReleaseSrcInput& input, SelectReleaseSrcOutput& output)
-//--------------------------------------------------------------------------
-{
-  triton_select_sources(
-      ctx, input.target, input.source_instances, output.chosen_ranking);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::speculate(
-    const MapperContext ctx, const Release& release, SpeculativeOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.speculate = false;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::report_profiling(
-    const MapperContext ctx, const Release& release,
-    const ReleaseProfilingInfo& input)
-//--------------------------------------------------------------------------
-{
-  // No profiling for triton yet
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_sharding_functor(
-    const MapperContext ctx, const Release& release,
-    const SelectShardingFunctorInput& input,
-    SelectShardingFunctorOutput& output)
-//--------------------------------------------------------------------------
-{
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_partition_projection(
-    const MapperContext ctx, const Partition& partition,
-    const SelectPartitionProjectionInput& input,
-    SelectPartitionProjectionOutput& output)
-//--------------------------------------------------------------------------
-{
-  // If we have an open complete partition then use it
-  if (!input.open_complete_partitions.empty())
-    output.chosen_partition = input.open_complete_partitions[0];
-  else
-    output.chosen_partition = LogicalPartition::NO_PART;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::map_partition(
-    const MapperContext ctx, const Partition& partition,
-    const MapPartitionInput& input, MapPartitionOutput& output)
-//--------------------------------------------------------------------------
-{
-  const RegionRequirement& req = partition.requirement;
-  output.chosen_instances.resize(req.privilege_fields.size());
-  const std::vector<PhysicalInstance>& valid = input.valid_instances;
-  std::vector<PhysicalInstance> needed_acquires;
-  unsigned fidx = 0;
-  for (std::set<FieldID>::const_iterator it = req.privilege_fields.begin();
-       it != req.privilege_fields.end(); it++, fidx++) {
-    if (find_existing_instance(
-            req.region, *it, local_system_memory,
-            output.chosen_instances[fidx]) ||
-        map_tensor(
-            ctx, partition, 0, req.region, *it, local_system_memory,
-            Processor::NO_PROC, valid, output.chosen_instances[fidx])) {
-      needed_acquires.push_back(output.chosen_instances[fidx]);
-    }
-  }
-  while (!needed_acquires.empty() &&
-         !runtime->acquire_and_filter_instances(
-             ctx, needed_acquires, true /*filter on acquire*/)) {
-    assert(!needed_acquires.empty());
-    std::set<PhysicalInstance> failed_instances;
-    filter_failed_acquires(needed_acquires, failed_instances);
-    // Now go through all the fields for the instances and try to remap
-    std::set<FieldID>::const_iterator fit = req.privilege_fields.begin();
-    for (unsigned idx = 0; idx < output.chosen_instances.size(); idx++, fit++) {
-      if (failed_instances.find(output.chosen_instances[idx]) ==
-          failed_instances.end())
-        continue;
-      // Now try to remap it
-      if (map_tensor(
-              ctx, partition, 0 /*idx*/, req.region, *fit, local_system_memory,
-              Processor::NO_PROC, valid, output.chosen_instances[idx]))
-        needed_acquires.push_back(output.chosen_instances[idx]);
-    }
-  }
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_partition_sources(
-    const MapperContext ctx, const Partition& partition,
-    const SelectPartitionSrcInput& input, SelectPartitionSrcOutput& output)
-//--------------------------------------------------------------------------
-{
-  triton_select_sources(
-      ctx, input.target, input.source_instances, output.chosen_ranking);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::report_profiling(
-    const MapperContext ctx, const Partition& partition,
-    const PartitionProfilingInfo& input)
-//--------------------------------------------------------------------------
-{
-  // No profiling yet
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_sharding_functor(
-    const MapperContext ctx, const Partition& partition,
-    const SelectShardingFunctorInput& input,
-    SelectShardingFunctorOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.chosen_functor = find_sharding_functor(partition)->sharding_id;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_sharding_functor(
-    const MapperContext ctx, const Fill& fill,
-    const SelectShardingFunctorInput& input,
-    SelectShardingFunctorOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.chosen_functor = find_sharding_functor(fill)->sharding_id;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::configure_context(
-    const MapperContext ctx, const Task& task, ContextConfigOutput& output)
-//--------------------------------------------------------------------------
-{
-  // Use the defaults currently
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::pack_tunable(
-    const int value, Mapper::SelectTunableOutput& output)
-//--------------------------------------------------------------------------
-{
-  int* result = (int*)malloc(sizeof(value));
-  *result = value;
-  output.value = result;
-  output.size = sizeof(value);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_tunable_value(
-    const MapperContext ctx, const Task& task, const SelectTunableInput& input,
-    SelectTunableOutput& output)
-//--------------------------------------------------------------------------
-{
-  // No tunable values at the moment
-  abort();
-}
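pack_tunable above hands the runtime a heap-allocated copy of the value along with its size; a tiny generic variant of the same idea (a hypothetical helper, assuming the consumer eventually frees the buffer, as the bare malloc in pack_tunable implies):

#include <cstdlib>
#include <cstring>

template <typename T>
void pack_value(const T& value, void*& out_ptr, size_t& out_size) {
  T* result = static_cast<T*>(std::malloc(sizeof(T)));
  std::memcpy(result, &value, sizeof(T));  // T must be trivially copyable
  out_ptr = result;
  out_size = sizeof(T);
}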
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_sharding_functor(
-    const MapperContext ctx, const MustEpoch& epoch,
-    const SelectShardingFunctorInput& input,
-    MustEpochShardingFunctorOutput& output)
-//--------------------------------------------------------------------------
-{
-  // No must epoch launches in triton
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::memoize_operation(
-    const MapperContext ctx, const Mappable& mappable,
-    const MemoizeInput& input, MemoizeOutput& output)
-//--------------------------------------------------------------------------
-{
-  output.memoize = true;
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::map_must_epoch(
-    const MapperContext ctx, const MapMustEpochInput& input,
-    MapMustEpochOutput& output)
-//--------------------------------------------------------------------------
-{
-  // No must epoch launches in triton
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::map_dataflow_graph(
-    const MapperContext ctx, const MapDataflowGraphInput& input,
-    MapDataflowGraphOutput& output)
-//--------------------------------------------------------------------------
-{
-  // Not supported yet
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_tasks_to_map(
-    const MapperContext ctx, const SelectMappingInput& input,
-    SelectMappingOutput& output)
-//--------------------------------------------------------------------------
-{
-  // Just map all the ready tasks
-  for (std::list<const Task*>::const_iterator it = input.ready_tasks.begin();
-       it != input.ready_tasks.end(); it++)
-    output.map_tasks.insert(*it);
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::select_steal_targets(
-    const MapperContext ctx, const SelectStealingInput& input,
-    SelectStealingOutput& output)
-//--------------------------------------------------------------------------
-{
-  // Nothing to do, no stealing in the triton mapper currently
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::permit_steal_request(
-    const MapperContext ctx, const StealRequestInput& input,
-    StealRequestOutput& output)
-//--------------------------------------------------------------------------
-{
-  // Nothing to do, no stealing in the triton mapper currently
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::handle_message(
-    const MapperContext ctx, const MapperMessage& message)
-//--------------------------------------------------------------------------
-{
-  // We shouldn't be receiving any messages currently
-  abort();
-}
-
-//--------------------------------------------------------------------------
-void
-StrategyMapper::handle_task_result(
-    const MapperContext ctx, const MapperTaskResult& result)
-//--------------------------------------------------------------------------
-{
-  // Nothing to do since we should never get one of these
-  abort();
-}
-
-}}} // namespace triton::backend::legion
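The mapper's instance cache (local_instances in the header below) is keyed by a FieldMemInfo triple whose operator< supplies a lexicographic strict weak ordering over (tid, fid, memory). A stripped-down illustration of the same keying scheme with plain ints (hypothetical stand-in types):

#include <map>
#include <tuple>

struct Key {
  int tid, fid, mem;
  bool operator<(const Key& rhs) const {
    // std::tie gives the same lexicographic strict weak ordering as the
    // hand-written chain of comparisons in FieldMemInfo.
    return std::tie(tid, fid, mem) < std::tie(rhs.tid, rhs.fid, rhs.mem);
  }
};

int main() {
  std::map<Key, const char*> cache;
  cache[{1, 2, 3}] = "instances for field 2 of tree 1 in memory 3";
  return cache.count({1, 2, 3}) == 1 ? 0 : 1;
}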
diff --git a/triton/src/strategy.h b/triton/src/strategy.h
deleted file mode 100644
index c79d4ed4b1..0000000000
--- a/triton/src/strategy.h
+++ /dev/null
@@ -1,493 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LEGION_TRITON_STRATEGY_H__
-#define __LEGION_TRITON_STRATEGY_H__
-
-#include "config.h"
-#include "legion.h"
-#include "legion/legion_mapping.h"
-#include "mappers/null_mapper.h"
-#include "types.h"
-
-namespace triton { namespace backend { namespace legion {
-
-struct LayerStrategy;
-
-class ShardingFunction : public Legion::ShardingFunctor {
- public:
-  ShardingFunction(Legion::ShardingID sid, const LayerStrategy* strategy);
-
- public:
-  Realm::Processor find_proc(
-      const Legion::DomainPoint& point, const Legion::Domain& domain);
-  Legion::ShardID shard(
-      const Legion::DomainPoint& point, const Legion::Domain& full_space,
-      const size_t total_size);
-
- public:
-  const Legion::ShardingID sharding_id;
-  const LayerStrategy* const strategy;
-};
-
-struct LayerStrategy {
- public:
-  LayerStrategy(
-      Legion::ShardingID sid, Legion::MappingTagID tag,
-      Legion::Runtime* runtime);
-  ~LayerStrategy(void);
-
- public:
-  Legion::Domain get_launch_domain(void) const;
-  // 'global' domain should be inclusive
-  Legion::Domain find_local_domain(
-      Realm::Processor proc, const Legion::Domain& global) const;
-  bool is_local_processor(Realm::Processor proc) const;
-  unsigned find_local_offset(Realm::Processor proc) const;
-  Legion::DomainPoint find_local_point(Realm::Processor proc) const;
-
- public:
-  ShardingFunction* const sharding_function;
-  const Legion::MappingTagID tag;
-
- public:
-  Realm::Processor::Kind kind;
-  int nDims, dim[LEGION_MAX_DIM];
-  unsigned nProcs;
-  Realm::Processor local_processors[MAX_LOCAL_PROCS];
-  Legion::DomainPoint local_points[MAX_LOCAL_PROCS];
-  std::vector<Realm::Processor> global_processors;
-};
-
-struct PartitionStrategy {
- public:
-  PartitionStrategy(
-      LegionModelState* model, std::vector<const LayerStrategy*>&& layers)
-      : layers(std::move(layers))
-  {
-  }
-  ~PartitionStrategy(void);
-
- public:
-  const std::vector<const LayerStrategy*> layers;
-
- public:
-  static PartitionStrategy* LoadStrategy(
-      const std::string& filename, LegionModelState* model);
-};
-
-class StrategyMapper : public Legion::Mapping::Mapper {
- public:
-  struct FieldMemInfo {
-   public:
-    FieldMemInfo(void) {}
-    FieldMemInfo(Legion::RegionTreeID t, Legion::FieldID f, Legion::Memory m)
-        : tid(t), fid(f), memory(m)
-    {
-    }
-
-   public:
-    inline bool operator==(const FieldMemInfo& rhs) const
-    {
-      if (tid != rhs.tid)
-        return false;
-      if (fid != rhs.fid)
-        return false;
-      if (memory != rhs.memory)
-        return false;
-      return true;
-    }
-    inline bool operator<(const FieldMemInfo& rhs) const
-    {
-      if (tid < rhs.tid)
-        return true;
-      if (tid > rhs.tid)
-        return false;
-      if (fid < rhs.fid)
-        return true;
-      if (fid > rhs.fid)
-        return false;
-      return memory < rhs.memory;
-    }
-
-   public:
-    Legion::RegionTreeID tid;
-    Legion::FieldID fid;
-    Legion::Memory memory;
-  };
-  struct InstanceInfo {
-   public:
-    InstanceInfo(void) {}
-    InstanceInfo(
-        Legion::LogicalRegion r, const Legion::Domain& b,
-        Legion::Mapping::PhysicalInstance inst)
-        : instance(inst), bounding_box(b)
-    {
-      regions.push_back(r);
-    }
-
-   public:
-    Legion::Mapping::PhysicalInstance instance;
-    Legion::Domain bounding_box;
-    std::vector<Legion::LogicalRegion> regions;
-  };
-  struct InstanceInfos {
-   public:
-    inline bool has_instance(
-        Legion::LogicalRegion region,
-        Legion::Mapping::PhysicalInstance& result) const
-    {
-      std::map<Legion::LogicalRegion, unsigned>::const_iterator finder =
-          region_mapping.find(region);
-      if (finder == region_mapping.end())
-        return false;
-      const InstanceInfo& info = instances[finder->second];
-      result = info.instance;
-      return true;
-    }
-
-   public:
-    inline unsigned insert(
-        Legion::LogicalRegion region, const Legion::Domain& bound,
-        Legion::Mapping::PhysicalInstance inst)
-    {
-      unsigned index = instances.size();
-      for (unsigned idx = 0; idx < instances.size(); idx++) {
-        if (inst != instances[idx].instance)
-          continue;
-        index = idx;
-        break;
-      }
-      if (index == instances.size())
-        instances.push_back(InstanceInfo(region, bound, inst));
-      region_mapping[region] = index;
-      return index;
-    }
-    inline bool filter(const Legion::Mapping::PhysicalInstance& inst)
-    {
-      for (unsigned idx = 0; idx < instances.size(); idx++) {
-        if (instances[idx].instance != inst)
-          continue;
-        // We also need to update any of the other region mappings
-        for (std::map<Legion::LogicalRegion, unsigned>::iterator it =
-                 region_mapping.begin();
-             it != region_mapping.end();
-             /*nothing*/) {
-          if (it->second == idx) {
-            std::map<Legion::LogicalRegion, unsigned>::iterator to_delete =
-                it++;
-            region_mapping.erase(to_delete);
-          } else {
-            if (it->second > idx)
-              it->second--;
-            it++;
-          }
-        }
-        instances.erase(instances.begin() + idx);
-        break;
-      }
-      return instances.empty();
-    }
-
-   public:
-    // A list of instances that we have for this field in this memory
-    std::vector<InstanceInfo> instances;
-    // Mapping for logical regions that we already know have instances
-    std::map<Legion::LogicalRegion, unsigned> region_mapping;
-  };
-
- public:
-  StrategyMapper(
-      const PartitionStrategy* strategy,
-      Legion::Mapping::MapperRuntime* runtime, Legion::Machine machine);
-  virtual ~StrategyMapper(void);
-
- protected:
-  // Start-up methods
-  static Legion::AddressSpaceID get_local_node(void);
-  static size_t get_total_nodes(Legion::Machine m);
-  static const char* create_name(Legion::AddressSpace node);
-
- public:
-  virtual const char* get_mapper_name(void) const;
-  virtual Legion::Mapping::Mapper::MapperSyncModel get_mapper_sync_model(
-      void) const;
-  virtual bool request_valid_instances(void) const { return false; }
-
- public:  // Task mapping calls
-  virtual void select_task_options(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      TaskOptions& output);
-  virtual void premap_task(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const PremapTaskInput& input, PremapTaskOutput& output);
-  virtual void slice_task(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const SliceTaskInput& input, SliceTaskOutput& output);
-  virtual void map_task(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const MapTaskInput& input, MapTaskOutput& output);
-  virtual void map_replicate_task(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const MapTaskInput& input, const MapTaskOutput& default_output,
-      MapReplicateTaskOutput& output);
-  virtual void select_task_variant(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const SelectVariantInput& input, SelectVariantOutput& output);
-  virtual void postmap_task(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const PostMapInput& input, PostMapOutput& output);
-  virtual void select_task_sources(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const SelectTaskSrcInput& input, SelectTaskSrcOutput& output);
- virtual void speculate( - const Legion::Mapping::MapperContext ctx, const Legion::Task& task, - SpeculativeOutput& output); - virtual void report_profiling( - const Legion::Mapping::MapperContext ctx, const Legion::Task& task, - const TaskProfilingInfo& input); - virtual void select_sharding_functor( - const Legion::Mapping::MapperContext ctx, const Legion::Task& task, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output); - ShardingFunction* find_sharding_functor(const Legion::Mappable& mappable); - - public: // Inline mapping calls - virtual void map_inline( - const Legion::Mapping::MapperContext ctx, - const Legion::InlineMapping& inline_op, const MapInlineInput& input, - MapInlineOutput& output); - virtual void select_inline_sources( - const Legion::Mapping::MapperContext ctx, - const Legion::InlineMapping& inline_op, const SelectInlineSrcInput& input, - SelectInlineSrcOutput& output); - virtual void report_profiling( - const Legion::Mapping::MapperContext ctx, - const Legion::InlineMapping& inline_op, const InlineProfilingInfo& input); - - public: // Copy mapping calls - virtual void map_copy( - const Legion::Mapping::MapperContext ctx, const Legion::Copy& copy, - const MapCopyInput& input, MapCopyOutput& output); - virtual void select_copy_sources( - const Legion::Mapping::MapperContext ctx, const Legion::Copy& copy, - const SelectCopySrcInput& input, SelectCopySrcOutput& output); - virtual void speculate( - const Legion::Mapping::MapperContext ctx, const Legion::Copy& copy, - SpeculativeOutput& output); - virtual void report_profiling( - const Legion::Mapping::MapperContext ctx, const Legion::Copy& copy, - const CopyProfilingInfo& input); - virtual void select_sharding_functor( - const Legion::Mapping::MapperContext ctx, const Legion::Copy& copy, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output); - - public: // Close mapping calls - virtual void map_close( - const Legion::Mapping::MapperContext ctx, const Legion::Close& close, - const MapCloseInput& input, MapCloseOutput& output); - virtual void select_close_sources( - const Legion::Mapping::MapperContext ctx, const Legion::Close& close, - const SelectCloseSrcInput& input, SelectCloseSrcOutput& output); - virtual void report_profiling( - const Legion::Mapping::MapperContext ctx, const Legion::Close& close, - const CloseProfilingInfo& input); - virtual void select_sharding_functor( - const Legion::Mapping::MapperContext ctx, const Legion::Close& close, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output); - - public: // Acquire mapping calls - virtual void map_acquire( - const Legion::Mapping::MapperContext ctx, const Legion::Acquire& acquire, - const MapAcquireInput& input, MapAcquireOutput& output); - virtual void speculate( - const Legion::Mapping::MapperContext ctx, const Legion::Acquire& acquire, - SpeculativeOutput& output); - virtual void report_profiling( - const Legion::Mapping::MapperContext ctx, const Legion::Acquire& acquire, - const AcquireProfilingInfo& input); - virtual void select_sharding_functor( - const Legion::Mapping::MapperContext ctx, const Legion::Acquire& acquire, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output); - - public: // Release mapping calls - virtual void map_release( - const Legion::Mapping::MapperContext ctx, const Legion::Release& release, - const MapReleaseInput& input, MapReleaseOutput& output); - virtual void select_release_sources( - const Legion::Mapping::MapperContext 
ctx, const Legion::Release& release,
-      const SelectReleaseSrcInput& input, SelectReleaseSrcOutput& output);
-  virtual void speculate(
-      const Legion::Mapping::MapperContext ctx, const Legion::Release& release,
-      SpeculativeOutput& output);
-  virtual void report_profiling(
-      const Legion::Mapping::MapperContext ctx, const Legion::Release& release,
-      const ReleaseProfilingInfo& input);
-  virtual void select_sharding_functor(
-      const Legion::Mapping::MapperContext ctx, const Legion::Release& release,
-      const SelectShardingFunctorInput& input,
-      SelectShardingFunctorOutput& output);
-
- public:  // Partition mapping calls
-  virtual void select_partition_projection(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Partition& partition,
-      const SelectPartitionProjectionInput& input,
-      SelectPartitionProjectionOutput& output);
-  virtual void map_partition(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Partition& partition, const MapPartitionInput& input,
-      MapPartitionOutput& output);
-  virtual void select_partition_sources(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Partition& partition, const SelectPartitionSrcInput& input,
-      SelectPartitionSrcOutput& output);
-  virtual void report_profiling(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Partition& partition, const PartitionProfilingInfo& input);
-  virtual void select_sharding_functor(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Partition& partition,
-      const SelectShardingFunctorInput& input,
-      SelectShardingFunctorOutput& output);
-
- public:  // Fill mapper calls
-  virtual void select_sharding_functor(
-      const Legion::Mapping::MapperContext ctx, const Legion::Fill& fill,
-      const SelectShardingFunctorInput& input,
-      SelectShardingFunctorOutput& output);
-
- public:  // Task execution mapping calls
-  virtual void configure_context(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      ContextConfigOutput& output);
-  virtual void select_tunable_value(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      const SelectTunableInput& input, SelectTunableOutput& output);
-
- public:  // Must epoch mapping
-  virtual void select_sharding_functor(
-      const Legion::Mapping::MapperContext ctx, const Legion::MustEpoch& epoch,
-      const SelectShardingFunctorInput& input,
-      MustEpochShardingFunctorOutput& output);
-  virtual void memoize_operation(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Mappable& mappable, const MemoizeInput& input,
-      MemoizeOutput& output);
-  virtual void map_must_epoch(
-      const Legion::Mapping::MapperContext ctx, const MapMustEpochInput& input,
-      MapMustEpochOutput& output);
-
- public:  // Dataflow graph mapping
-  virtual void map_dataflow_graph(
-      const Legion::Mapping::MapperContext ctx,
-      const MapDataflowGraphInput& input, MapDataflowGraphOutput& output);
-
- public:  // Mapping control and stealing
-  virtual void select_tasks_to_map(
-      const Legion::Mapping::MapperContext ctx, const SelectMappingInput& input,
-      SelectMappingOutput& output);
-  virtual void select_steal_targets(
-      const Legion::Mapping::MapperContext ctx,
-      const SelectStealingInput& input, SelectStealingOutput& output);
-  virtual void permit_steal_request(
-      const Legion::Mapping::MapperContext ctx, const StealRequestInput& input,
-      StealRequestOutput& output);
-
- public:  // Message and task-result handling
-  virtual void handle_message(
-      const Legion::Mapping::MapperContext ctx, const MapperMessage& message);
-  virtual void handle_task_result(
-      const Legion::Mapping::MapperContext ctx, const MapperTaskResult& result);
-
- protected:
-  bool find_existing_instance(
-      Legion::LogicalRegion region, Legion::FieldID fid,
-      Legion::Memory target_memory, Legion::Mapping::PhysicalInstance& result);
-  bool map_tensor(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Mappable& mappable, unsigned index,
-      Legion::LogicalRegion region, Legion::FieldID fid,
-      Legion::Memory target_memory, Legion::Processor target_proc,
-      const std::vector<Legion::Mapping::PhysicalInstance>& valid,
-      Legion::Mapping::PhysicalInstance& result,
-      Legion::ReductionOpID redop = 0);
-  void filter_failed_acquires(
-      std::vector<Legion::Mapping::PhysicalInstance>& needed_acquires,
-      std::set<Legion::Mapping::PhysicalInstance>& failed_acquires);
-  void report_failed_mapping(
-      const Legion::Mappable& mappable, unsigned index,
-      Legion::Memory target_memory, Legion::ReductionOpID redop);
-  void triton_select_sources(
-      const Legion::Mapping::MapperContext ctx,
-      const Legion::Mapping::PhysicalInstance& target,
-      const std::vector<Legion::Mapping::PhysicalInstance>& sources,
-      std::deque<Legion::Mapping::PhysicalInstance>& ranking);
-  bool has_variant(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      Legion::Processor::Kind kind);
-  Legion::VariantID find_variant(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task);
-  Legion::VariantID find_variant(
-      const Legion::Mapping::MapperContext ctx, const Legion::Task& task,
-      Legion::Processor target_proc);
-  void pack_tunable(const int value, Mapper::SelectTunableOutput& output);
-
- protected:
-  static inline bool physical_sort_func(
-      const std::pair<Legion::Mapping::PhysicalInstance, unsigned>& left,
-      const std::pair<Legion::Mapping::PhysicalInstance, unsigned>& right)
-  {
-    return (left.second < right.second);
-  }
-
- public:
-  const PartitionStrategy* const strategy;
-  const Legion::Machine machine;
-  const Legion::AddressSpace local_node;
-  const size_t total_nodes;
-  const char* const mapper_name;
-
- protected:
-  std::vector<Legion::Processor> local_cpus;
-  std::vector<Legion::Processor> local_gpus;
-  std::vector<Legion::Processor> local_omps;  // OpenMP processors
-  std::vector<Legion::Processor> local_ios;   // I/O processors
-  std::vector<Legion::Processor> local_pys;   // Python processors
- protected:
-  Legion::Memory local_system_memory, local_zerocopy_memory;
-  std::map<Legion::Processor, Legion::Memory> local_frame_buffers;
-  std::map<Legion::Processor, Legion::Memory> local_numa_domains;
-
- protected:
-  std::map<
-      std::pair<Legion::TaskID, Legion::Processor::Kind>, Legion::VariantID>
-      used_variants;
-
- protected:
-  std::map<FieldMemInfo, InstanceInfos> local_instances;
-
- protected:
-  // These are used for computing sharding functions
-  std::map partition_color_space_dims;
-  std::map index_color_dims;
-};
-
-}}} // namespace triton::backend::legion
-
-#endif // __LEGION_TRITON_STRATEGY_H__
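used_variants above memoizes variant lookups per (task id, processor kind) so the runtime is only queried once per combination. The lookup-or-compute shape in isolation, with stand-in types and a hypothetical find callback (not Legion's API):

#include <map>
#include <utility>

using TaskID = unsigned;
using ProcKind = int;
using VariantID = unsigned;

template <typename FindFn>
VariantID find_variant_cached(
    std::map<std::pair<TaskID, ProcKind>, VariantID>& cache,
    TaskID task, ProcKind kind, FindFn&& find_variant) {
  const auto key = std::make_pair(task, kind);
  auto it = cache.find(key);
  if (it != cache.end())
    return it->second;  // hit: reuse the previous query result
  const VariantID v = find_variant(task, kind);  // miss: ask the runtime once
  cache.emplace(key, v);
  return v;
}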
diff --git a/triton/src/tensor.cc b/triton/src/tensor.cc
deleted file mode 100644
index 9cfee0959f..0000000000
--- a/triton/src/tensor.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tensor.h"
-#include "operator.h"
-
-using namespace Legion;
-
-namespace triton { namespace backend { namespace legion {
-
-Tensor::Tensor(Operator* op, DataType t, const size_t* dims, size_t num_dims)
-    : owner(op), type(t), bounds(dims, dims + num_dims)
-{
-  for (unsigned idx = 0; idx < MAX_NUM_INSTANCES; idx++) {
-    region[idx] = LogicalRegion::NO_REGION;
-    partition[idx] = LogicalPartition::NO_PART;
-  }
-}
-
-Tensor::Tensor(Operator* op, DataType t, const std::vector<size_t>& dims)
-    : owner(op), type(t), bounds(dims)
-{
-  for (unsigned idx = 0; idx < MAX_NUM_INSTANCES; idx++) {
-    region[idx] = LogicalRegion::NO_REGION;
-    partition[idx] = LogicalPartition::NO_PART;
-  }
-}
-
-Tensor::~Tensor(void) {}
-
-Weights::Weights(Operator* op, DataType t, const size_t* dims, size_t num_dims)
-    : Tensor(op, t, dims, num_dims)
-{
-  const Memory local_sysmem = op->model->runtime_->local_sysmem_;
-  for (size_t idx = 0; idx < MAX_LOCAL_PROCS; ++idx) {
-    local_memory[idx] = local_sysmem;
-    local_allocation[idx] = nullptr;
-  }
-}
-
-Weights::Weights(Operator* op, DataType t, const std::vector<size_t>& dims)
-    : Tensor(op, t, dims)
-{
-  const Memory local_sysmem = op->model->runtime_->local_sysmem_;
-  for (size_t idx = 0; idx < MAX_LOCAL_PROCS; ++idx) {
-    local_memory[idx] = local_sysmem;
-    local_allocation[idx] = nullptr;
-  }
-}
-
-Weights::~Weights(void)
-{
-  for (size_t idx = 0; idx < MAX_LOCAL_PROCS; ++idx) {
-    assert(local_allocation[idx] == nullptr);
-  }
-}
-
-}}} // namespace triton::backend::legion
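Weights::~Weights above asserts every local_allocation slot is null, i.e. whoever fills the per-processor staging buffers must release them before the Weights object is destroyed. A schematic of that contract with a hypothetical size and plain malloc (the real code also tracks Legion memory handles alongside):

#include <cassert>
#include <cstdlib>

constexpr size_t kLocalProcs = 4;  // stand-in for MAX_LOCAL_PROCS

struct WeightsSketch {
  void* local_allocation[kLocalProcs] = {};
  ~WeightsSketch() {
    for (size_t i = 0; i < kLocalProcs; ++i)
      assert(local_allocation[i] == nullptr);  // must be freed by now
  }
};

int main() {
  WeightsSketch w;
  w.local_allocation[0] = std::malloc(256);  // stage some weights
  std::free(w.local_allocation[0]);          // release...
  w.local_allocation[0] = nullptr;           // ...and clear before destruction
  return 0;
}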
diff --git a/triton/src/tensor.h b/triton/src/tensor.h
deleted file mode 100644
index c8126b563f..0000000000
--- a/triton/src/tensor.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2022 NVIDIA CORPORATION
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LEGION_TRITON_TENSOR_H__
-#define __LEGION_TRITON_TENSOR_H__
-
-#include "config.h"
-#include "legion.h"
-#include "types.h"
-
-namespace triton { namespace backend { namespace legion {
-
-class Tensor {
- public:
-  Tensor(Operator* op, DataType type, const size_t* dims, size_t num_dims);
-  Tensor(Operator* op, DataType type, const std::vector<size_t>& dims);
-  virtual ~Tensor(void);
-
- public:
-  Operator* const owner;
-  const DataType type;
-  const std::vector<size_t> bounds;
-
- public:
-  Legion::LogicalRegion region[MAX_NUM_INSTANCES];
-  Legion::LogicalPartition partition[MAX_NUM_INSTANCES];
-};
-
-class Weights : public Tensor {
- public:
-  Weights(Operator* op, DataType type, const size_t* dims, size_t num_dims);
-  Weights(Operator* op, DataType type, const std::vector<size_t>& dims);
-  virtual ~Weights(void);
-
- public:
-  Legion::Domain local_bounds[MAX_LOCAL_PROCS];
-  Legion::Memory local_memory[MAX_LOCAL_PROCS];
-  void* local_allocation[MAX_LOCAL_PROCS];
-  size_t local_strides[MAX_LOCAL_PROCS][LEGION_MAX_DIM];
-};
-
-}}} // namespace triton::backend::legion
-
-#endif // __LEGION_TRITON_TENSOR_H__
diff --git a/triton/src/test/CMakeLists.txt b/triton/src/test/CMakeLists.txt
deleted file mode 100644
index d722f4081a..0000000000
--- a/triton/src/test/CMakeLists.txt
+++ /dev/null
@@ -1,104 +0,0 @@
-#------------------------------------------------------------------------------#
-# Copyright 2022 NVIDIA CORPORATION
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#------------------------------------------------------------------------------#
-
-cmake_minimum_required (VERSION 3.18)
-
-#
-# ONNX Parser
-#
-# Use customized protoc command to generate cpp files with proper layout
-set(PROTO_SRCS onnx/onnx-data.pb.cc onnx/onnx-ml.pb.cc onnx/onnx-operators-ml.pb.cc)
-add_custom_command(
-  OUTPUT ${PROTO_SRCS}
-  ALL
-  COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-    -I${CMAKE_CURRENT_SOURCE_DIR}/.. --cpp_out=${CMAKE_CURRENT_BINARY_DIR}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../onnx/*.proto
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  COMMENT "Compiling cpp files of the ONNX protos"
-)
-set(
-  ONNX_PARSER_SRCS
-  ${PROTO_SRCS}
-  ../onnx_parser.cc
-  ../tensor.cc
-  ../operator.cc
-)
-
-set(
-  ONNX_PARSER_HDRS
-  ../onnx_parser.h
-)
-
-file(GLOB MOCK_SRCS mock/* )
-set(
-  ONNX_PARSER_MOCK_SRCS
-  ${MOCK_SRCS}
-)
-
-set(
-  ONNX_PARSER_TEST_SRCS
-  onnx_parser_test.cc
-  ${ONNX_PARSER_SRCS}
-  ${ONNX_PARSER_MOCK_SRCS}
-)
-
-set(
-  ONNX_PARSER_TEST_HDRS
-  ${ONNX_PARSER_HDRS}
-)
-
-find_package(GTest REQUIRED)
-add_executable(
-  onnx_parser_test
-  ${ONNX_PARSER_TEST_SRCS}
-  ${ONNX_PARSER_TEST_HDRS}
-)
-set_target_properties(
-  onnx_parser_test
-  PROPERTIES
-    SKIP_BUILD_RPATH TRUE
-    BUILD_WITH_INSTALL_RPATH TRUE
-    INSTALL_RPATH_USE_LINK_PATH FALSE
-    INSTALL_RPATH ""
-)
-target_include_directories(
-  onnx_parser_test
-  PRIVATE ${GTEST_INCLUDE_DIR}
-  PRIVATE ${LEGION_ROOT}/include # Not using target as we only want declarations
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..
diff --git a/triton/src/test/CMakeLists.txt b/triton/src/test/CMakeLists.txt deleted file mode 100644 index d722f4081a..0000000000 --- a/triton/src/test/CMakeLists.txt +++ /dev/null @@ -1,104 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#------------------------------------------------------------------------------# - -cmake_minimum_required (VERSION 3.18) - -# -# ONNX Parser -# -# Use customized protoc command to generate cpp files with proper layout -set(PROTO_SRCS onnx/onnx-data.pb.cc onnx/onnx-ml.pb.cc onnx/onnx-operators-ml.pb.cc) -add_custom_command( - OUTPUT ${PROTO_SRCS} - ALL - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - -I${CMAKE_CURRENT_SOURCE_DIR}/.. --cpp_out=${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/../onnx/*.proto - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Compiling cpp files of the ONNX protos" -) -set( - ONNX_PARSER_SRCS - ${PROTO_SRCS} - ../onnx_parser.cc - ../tensor.cc - ../operator.cc -) - -set( - ONNX_PARSER_HDRS - ../onnx_parser.h -) - -file(GLOB MOCK_SRCS mock/* ) -set( - ONNX_PARSER_MOCK_SRCS - ${MOCK_SRCS} -) - -set( - ONNX_PARSER_TEST_SRCS - onnx_parser_test.cc - ${ONNX_PARSER_SRCS} - ${ONNX_PARSER_MOCK_SRCS} -) - -set( - ONNX_PARSER_TEST_HDRS - ${ONNX_PARSER_HDRS} -) - -find_package(GTest REQUIRED) -add_executable( - onnx_parser_test - ${ONNX_PARSER_TEST_SRCS} - ${ONNX_PARSER_TEST_HDRS} -) -set_target_properties( - onnx_parser_test - PROPERTIES - SKIP_BUILD_RPATH TRUE - BUILD_WITH_INSTALL_RPATH TRUE - INSTALL_RPATH_USE_LINK_PATH FALSE - INSTALL_RPATH "" -) -target_include_directories( - onnx_parser_test - PRIVATE ${GTEST_INCLUDE_DIR} - PRIVATE ${LEGION_ROOT}/include # Not using target as we only want declarations - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/.. - PRIVATE ${CMAKE_CURRENT_BINARY_DIR} - PRIVATE ${CUDA_INCLUDE_DIRS} -) -target_link_libraries( - onnx_parser_test - PRIVATE triton-core-serverapi # from repo-core - PRIVATE triton-backend-utils - PRIVATE ${GTEST_LIBRARY} - PRIVATE ${GTEST_MAIN_LIBRARY} - PRIVATE protobuf::libprotobuf - PRIVATE -L${CMAKE_CURRENT_SOURCE_DIR}/.. -) -install( - TARGETS onnx_parser_test - RUNTIME DESTINATION test -) - -# Test data -install( - DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/data - DESTINATION test -)
diff --git a/triton/src/test/data/add.onnx b/triton/src/test/data/add.onnx deleted file mode 100644 index f6c18819c6..0000000000 Binary files a/triton/src/test/data/add.onnx and /dev/null differ
diff --git a/triton/src/test/data/avg_pool.onnx b/triton/src/test/data/avg_pool.onnx deleted file mode 100644 index 8e421e5e2a..0000000000 Binary files a/triton/src/test/data/avg_pool.onnx and /dev/null differ
diff --git a/triton/src/test/data/avg_pool_autopad.onnx b/triton/src/test/data/avg_pool_autopad.onnx deleted file mode 100644 index 6e4a1fb81a..0000000000 Binary files a/triton/src/test/data/avg_pool_autopad.onnx and /dev/null differ
diff --git a/triton/src/test/data/avg_pool_ceil.onnx b/triton/src/test/data/avg_pool_ceil.onnx deleted file mode 100644 index d8c56b27e7..0000000000 Binary files a/triton/src/test/data/avg_pool_ceil.onnx and /dev/null differ
diff --git a/triton/src/test/data/avg_pool_count_include_pad.onnx b/triton/src/test/data/avg_pool_count_include_pad.onnx deleted file mode 100644 index 6ba2c8eaee..0000000000 Binary files a/triton/src/test/data/avg_pool_count_include_pad.onnx and /dev/null differ
diff --git a/triton/src/test/data/avg_pool_pad.onnx b/triton/src/test/data/avg_pool_pad.onnx deleted file mode 100644 index 599f029a25..0000000000 Binary files a/triton/src/test/data/avg_pool_pad.onnx and /dev/null differ
diff --git a/triton/src/test/data/cast.onnx b/triton/src/test/data/cast.onnx deleted file mode 100644 index e340fd2b94..0000000000 Binary files a/triton/src/test/data/cast.onnx and /dev/null differ
diff --git a/triton/src/test/data/conv2d_with_bias.onnx b/triton/src/test/data/conv2d_with_bias.onnx deleted file mode 100644 index 9956153b0a..0000000000 Binary files a/triton/src/test/data/conv2d_with_bias.onnx and /dev/null differ
diff --git a/triton/src/test/data/identity.onnx b/triton/src/test/data/identity.onnx deleted file mode 100644 index da9fc01ed0..0000000000 Binary files a/triton/src/test/data/identity.onnx and /dev/null differ
diff --git a/triton/src/test/data/max_pool.onnx b/triton/src/test/data/max_pool.onnx deleted file mode 100644 index e9da2011e3..0000000000 Binary files a/triton/src/test/data/max_pool.onnx and /dev/null differ
diff --git a/triton/src/test/data/max_pool_autopad.onnx b/triton/src/test/data/max_pool_autopad.onnx deleted file mode 100644 index 42c6d8c664..0000000000 Binary files a/triton/src/test/data/max_pool_autopad.onnx and /dev/null differ
diff --git a/triton/src/test/data/max_pool_ceil.onnx b/triton/src/test/data/max_pool_ceil.onnx deleted file mode 100644 index 4bb3c3b264..0000000000 Binary files a/triton/src/test/data/max_pool_ceil.onnx and /dev/null differ
diff --git a/triton/src/test/data/max_pool_dilations.onnx b/triton/src/test/data/max_pool_dilations.onnx deleted file mode 100644 index 171cb74b2b..0000000000 Binary files a/triton/src/test/data/max_pool_dilations.onnx and /dev/null differ
diff --git a/triton/src/test/data/max_pool_order.onnx b/triton/src/test/data/max_pool_order.onnx deleted file mode 100644 index e9173bb3c2..0000000000 Binary files a/triton/src/test/data/max_pool_order.onnx and /dev/null differ
diff --git a/triton/src/test/data/mul.onnx b/triton/src/test/data/mul.onnx deleted file mode 100644 index 1de5b71943..0000000000 Binary files a/triton/src/test/data/mul.onnx and /dev/null differ
diff --git a/triton/src/test/data/reciprocal.onnx b/triton/src/test/data/reciprocal.onnx deleted file mode 100644 index 1eeb4c9103..0000000000 Binary files a/triton/src/test/data/reciprocal.onnx and /dev/null differ
diff --git a/triton/src/test/data/softmax.onnx b/triton/src/test/data/softmax.onnx deleted file mode 100644 index 4a715b10b6..0000000000 Binary files a/triton/src/test/data/softmax.onnx and /dev/null differ
diff --git a/triton/src/test/data/softmax_default_axis.onnx b/triton/src/test/data/softmax_default_axis.onnx deleted file mode 100644 index 8d74038976..0000000000 Binary files a/triton/src/test/data/softmax_default_axis.onnx and /dev/null differ
diff --git a/triton/src/test/data/softmax_negative_axis.onnx b/triton/src/test/data/softmax_negative_axis.onnx deleted file mode 100644 index eb69708caf..0000000000 Binary files a/triton/src/test/data/softmax_negative_axis.onnx and /dev/null differ
diff --git a/triton/src/test/data/sqrt.onnx b/triton/src/test/data/sqrt.onnx deleted file mode 100644 index f1ded959b8..0000000000 Binary files a/triton/src/test/data/sqrt.onnx and /dev/null differ
diff --git a/triton/src/test/data/sub.onnx b/triton/src/test/data/sub.onnx deleted file mode 100644 index 43c7f2ad75..0000000000 Binary files a/triton/src/test/data/sub.onnx and /dev/null differ
diff --git a/triton/src/test/data/tanh.onnx b/triton/src/test/data/tanh.onnx deleted file mode 100644 index 58a49d4422..0000000000 Binary files a/triton/src/test/data/tanh.onnx and /dev/null differ
diff --git a/triton/src/test/mock/binary.cc b/triton/src/test/mock/binary.cc deleted file mode 100644 index 5759a85b16..0000000000 --- a/triton/src/test/mock/binary.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "operators/binary.h" - -namespace triton { namespace backend { namespace legion { - -BinaryOperator::BinaryOperator( - LegionModelState* model, const LayerStrategy* strategy, OperatorType type, - bool inplace_a, const char* name) - : Operator(model, strategy, type, name, 2, 0, 1), inplace(inplace_a) -{ -} - -void -BinaryOperator::Configure(Tensor* input0, Tensor* input1, Tensor* output) -{ - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model); - vec_ptr->emplace_back(input0); - vec_ptr->emplace_back(input1); - vec_ptr->emplace_back(output); -} - -Legion::Domain -BinaryOperator::GetBounds(Realm::Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -BinaryOperator::Load(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -BinaryOperator::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -BinaryOperator::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -BinaryOperator::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -BinaryOperator::Free(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -BinaryOperator::forward_cpu( - const Legion::Task* task, - const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, - Legion::Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -BinaryOperator::PreregisterTaskVariants(void) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -}}} // namespace triton::backend::legion
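The "hack" in BinaryOperator::Configure works because the test passes a std::vector<Tensor*> disguised as the opaque LegionModelState pointer, and the mock casts it back to hand the parsed tensors to the assertions. A minimal standalone sketch of that round-trip, with the backend types reduced to stubs:

#include <cassert>
#include <vector>

struct Tensor {};          // stand-in for the backend's Tensor
struct LegionModelState;   // opaque to the mock, never dereferenced as such

// Mirrors BinaryOperator::Configure: the mock treats the opaque model
// pointer as the vector the test actually passed in.
void configure(LegionModelState* model, Tensor* in0, Tensor* in1, Tensor* out) {
  auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model);
  vec_ptr->push_back(in0);
  vec_ptr->push_back(in1);
  vec_ptr->push_back(out);
}

int main() {
  std::vector<Tensor*> model_stub;  // what the test really allocates
  Tensor a, b, c;
  configure(reinterpret_cast<LegionModelState*>(&model_stub), &a, &b, &c);
  assert(model_stub.size() == 3);   // the test can now inspect the tensors
}

The cast is only sound because the same vector pointer goes in and comes out; nothing ever uses it as a real model state.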
diff --git a/triton/src/test/mock/concat.cc b/triton/src/test/mock/concat.cc deleted file mode 100644 index 3e81d767ff..0000000000 --- a/triton/src/test/mock/concat.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "operators/concat.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -ConcatArgs::ConcatArgs(void) : local_index(0), datatype(DT_NONE), axis(-1) {} - -Concat::Concat( - LegionModelState* model, const LayerStrategy* strategy, size_t inputs, - int ax, const char* name) - : Operator(model, strategy, OperatorType::OP_CONCAT, name, inputs, 0, 1), - axis(ax) -{ - assert(inputs > 0); -} - -void -Concat::Configure(const std::vector<Tensor*>& ins, Tensor* out) -{ - assert(num_inputs == ins.size()); - inputs = ins; - size_t axis_size = 0; - const size_t dims = out->bounds.size(); - assert(dims == strategy->nDims); - for (unsigned idx = 0; idx < inputs.size(); idx++) { - assert(inputs[idx]->type == out->type); - assert(inputs[idx]->bounds.size() == dims); - for (unsigned d = 0; d < dims; d++) { - if (d == axis) - axis_size += inputs[idx]->bounds[d]; - else - assert(inputs[idx]->bounds[d] == out->bounds[d]); - } - } - assert(axis_size == out->bounds[axis]); - outputs.push_back(out); - // Figure out the output tiling domain - std::vector<size_t> tile_sizes(dims); - for (unsigned d = 0; d < dims; d++) - tile_sizes[d] = (out->bounds[d] + strategy->dim[d] - 1) / strategy->dim[d]; - coord_t offset = 0; - // Now compute the domains and transforms needed for constructing - // the partitions for each of the inputs - input_color_spaces.resize(num_inputs); - input_extents.resize(num_inputs); - for (unsigned idx = 0; idx < num_inputs; idx++) { - DomainPoint lo, hi, color_lo, color_hi; - lo.dim = dims; - hi.dim = dims; - color_lo.dim = dims; - color_hi.dim = dims; - for (int d = 0; d < dims; d++) { - if (d == axis) { - const coord_t extent = inputs[idx]->bounds[d]; - lo[d] = -offset; - hi[d] = (tile_sizes[d] - 1 /*inclusive*/) - offset; - color_lo[d] = offset / tile_sizes[d]; - color_hi[d] = (offset + extent - 1) / tile_sizes[d]; - offset += extent; - } else { - lo[d] = 0; - hi[d] = tile_sizes[d] - 1; // make it inclusive - color_lo[d] = 0; - color_hi[d] = strategy->dim[d] - 1; // make it inclusive - } - } - input_color_spaces[idx] = Domain(color_lo, color_hi); - input_extents[idx] = Domain(lo, hi); - } - // The input transform is the same across all the inputs - switch (dims) { -#define DIMFUNC(N) \ - case N: { \ - Transform<N, N> transform; \ - for (int i = 0; i < N; i++) \ - for (int j = 0; j < N; j++) \ - if (i == j) \ - transform[i][j] = tile_sizes[i]; \ - else \ - transform[i][j] = 0; \ - input_transform = transform; \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } -} - -Domain -Concat::GetBounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Concat::Load(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Concat::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Concat::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Concat::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Concat::Free(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Concat::PreregisterTaskVariants(void) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Concat::forward_cpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Concat::forward_gpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -#endif - -}}} // namespace triton::backend::legion
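The extent and color-space arithmetic in Concat::Configure can be exercised standalone. The sketch below reproduces the per-input loop for one concrete 1-D case (two inputs of extent 3 and 5 concatenated into an output of extent 8, split into 2 tiles); the numbers are illustrative and not taken from the test suite.

#include <cstdio>
#include <vector>

int main() {
  const long out_bound = 8, pieces = 2;
  const long tile = (out_bound + pieces - 1) / pieces;  // 4, same ceiling division as above
  std::vector<long> in_bounds = {3, 5};
  long offset = 0;
  for (long extent : in_bounds) {
    long lo = -offset;                             // extent lower bound, shifted by offset
    long hi = (tile - 1) - offset;                 // inclusive upper bound
    long color_lo = offset / tile;                 // first output tile this input touches
    long color_hi = (offset + extent - 1) / tile;  // last output tile this input touches
    std::printf("input extent=%ld: extent=[%ld,%ld] colors=[%ld,%ld]\n",
                extent, lo, hi, color_lo, color_hi);
    offset += extent;
  }
  // input 0 (elements 0..2) lives entirely in tile 0: colors=[0,0]
  // input 1 (elements 3..7) straddles tiles 0 and 1: colors=[0,1]
}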
test"); -} - -void -Concat::Free(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Concat::PreregisterTaskVariants(void) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Concat::forward_cpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Concat::forward_gpu( - const Task* task, const std::vector& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -#endif - -}}} // namespace triton::backend::legion diff --git a/triton/src/test/mock/conv2d.cc b/triton/src/test/mock/conv2d.cc deleted file mode 100644 index ddc9c03b9b..0000000000 --- a/triton/src/test/mock/conv2d.cc +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "operators/conv2d.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Conv2D::Conv2D( - LegionModelState* model, const LayerStrategy* strategy, size_t inChannels, - size_t outChannels, size_t kernelH, size_t kernelW, size_t strideH, - size_t strideW, size_t paddingH, size_t paddingW, ActivationMode act, - size_t gps, bool bias, const char* name) - : Operator( - model, strategy, OP_CONV2D, name, 1 /*inputs*/, - bias ? 2 : 1 /*weights*/, 1 /*outputs*/), - activation(act), in_channels(inChannels), out_channels(outChannels), - kernel_h(kernelH), kernel_w(kernelW), stride_h(strideH), - stride_w(strideW), padding_h(paddingH), padding_w(paddingW), groups(gps), - use_bias(bias) -{ -} - -Conv2D::~Conv2D() {} - -void -Conv2D::Configure(Tensor* input, Weights* wts, Tensor* output, Weights* bias) -{ - assert(input != nullptr); - assert(in_channels == input->bounds[1]); - assert(wts != nullptr); - assert(output != nullptr); - if (use_bias) - assert(bias != nullptr); - else - assert(bias == nullptr); - inputs.push_back(input); - outputs.push_back(output); - weights.push_back(wts); - if (use_bias) - weights.push_back(bias); - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast*>(model); - vec_ptr->emplace_back(input); - vec_ptr->emplace_back(wts); - if (use_bias) { - vec_ptr->emplace_back(bias); - } - vec_ptr->emplace_back(output); -} - -Rect<4> -Conv2D::GetWeightBounds(Realm::Processor proc) -{ - if ((weights.size() < 1) || (weights.size() > 2)) { - throw std::invalid_argument("Weight is not configured for Conv2D operator"); - } - // Splitting the H, W without actually looking at the - // patitioning in LayerStrategy, but the positioning in terms of - // global processor, which is what we can control in mock LayerStrategy. 
- size_t h_stride = - (weights[0]->bounds[2] - (strategy->global_processors.size() - 1)) / - strategy->global_processors.size(); - size_t w_stride = - (weights[0]->bounds[3] - (strategy->global_processors.size() - 1)) / - strategy->global_processors.size(); - DomainPoint lo, hi; - lo.dim = 4; - lo[0] = 0; - lo[1] = 0; - hi.dim = 4; - hi[0] = weights[0]->bounds[0] - 1; - hi[1] = weights[0]->bounds[1] - 1; - for (size_t idx = 0; idx < strategy->global_processors.size(); ++idx) { - if (proc == strategy->global_processors[idx]) { - lo[2] = h_stride * idx; - hi[2] = h_stride * (idx + 1) - 1; - if (hi[2] > (weights[0]->bounds[2] - 1)) { - hi[2] = (weights[0]->bounds[2] - 1); - } - - lo[3] = w_stride * idx; - hi[3] = w_stride * (idx + 1) - 1; - if (hi[3] > (weights[0]->bounds[3] - 1)) { - hi[3] = (weights[0]->bounds[3] - 1); - } - } - } - return Rect<4>(lo, hi); -} - -Rect<1> -Conv2D::GetBiasBounds(Realm::Processor proc) -{ - if (weights.size() != 2) { - throw std::invalid_argument("Bias is not configured for Conv2D operator"); - } - // Always return the whole bias bound - DomainPoint lo, hi; - lo.dim = 1; - lo[0] = 0; - hi.dim = 1; - hi[0] = weights[1]->bounds[0] - 1; - return Rect<1>(lo, hi); -} - -void -Conv2D::Load(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Conv2D::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Conv2D::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Conv2D::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Conv2D::Free(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Conv2D::PreregisterTaskVariants() -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -}}} // namespace triton::backend::legion diff --git a/triton/src/test/mock/legion.cc b/triton/src/test/mock/legion.cc deleted file mode 100644 index 70469bd9ea..0000000000 --- a/triton/src/test/mock/legion.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "legion.h" -#include - -namespace Legion { - -// -// Dummy implementation that will raise error if trying to invoke methods -// -const IndexSpace IndexSpace::NO_SPACE = IndexSpace(); -const FieldSpace FieldSpace::NO_SPACE = FieldSpace(); -const IndexPartition IndexPartition::NO_PART = IndexPartition(); - -FieldSpace::FieldSpace() : id(0) {} -IndexSpace::IndexSpace() : id(0), tid(0), type_tag(0) {} -IndexPartition::IndexPartition() : id(0), tid(0), type_tag(0) {} -LogicalPartition::LogicalPartition() - : tree_id(0), index_partition(IndexPartition::NO_PART), - field_space(FieldSpace::NO_SPACE) -{ -} -LogicalRegion::LogicalRegion() - : tree_id(0), index_space(IndexSpace::NO_SPACE), - field_space(FieldSpace::NO_SPACE) -{ -} -const LogicalRegion LogicalRegion::NO_REGION = LogicalRegion(); -const LogicalPartition LogicalPartition::NO_PART = LogicalPartition(); - -Predicate::Predicate() {} -Predicate::~Predicate() {} -Grant::Grant() {} -Grant::~Grant() {} -Future::Future() {} -Future::~Future() {} -FutureMap::FutureMap() {} -FutureMap::~FutureMap() {} -ArgumentMap::ArgumentMap() {} -ArgumentMap::~ArgumentMap() {} -RegionRequirement::RegionRequirement() {} -RegionRequirement::~RegionRequirement() {} -IndexTaskLauncher::IndexTaskLauncher() {} -ExternalResources::ExternalResources() {} -ExternalResources::~ExternalResources() {} -PhysicalRegion::PhysicalRegion() {} -PhysicalRegion::~PhysicalRegion() {} - -} // namespace Legion diff --git a/triton/src/test/mock/matmul.cc b/triton/src/test/mock/matmul.cc deleted file mode 100644 index d722b7a04e..0000000000 --- a/triton/src/test/mock/matmul.cc +++ /dev/null @@ -1,352 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "operators/matmul.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -MatMul::MatMul( - LegionModelState* model, const LayerStrategy* strategy, const char* name) - : Operator(model, strategy, OperatorType::OP_MATMUL, name, 2, 0, 1) -{ -} - -template <int DIM> -void -MatMul::compute_in1_parameters(Tensor* in1, Tensor* out) -{ - Rect<DIM> extent, colors; - Transform<DIM, DIM> transform; - for (int i = 0; i < DIM; i++) - for (int j = 0; j < DIM; j++) transform[i][j] = 0; - assert(out->bounds.size() >= in1->bounds.size()); - size_t dimoff = out->bounds.size() - in1->bounds.size(); - for (int i = 0; i < DIM; i++) { - extent.lo[i] = 0; - colors.lo[i] = 0; - if (i == (DIM - 1)) { - /* need the whole dimension */ - extent.hi[i] = in1->bounds[i] - 1; /*inclusive*/ - colors.hi[i] = 0; - } else if (in1->bounds[i] == 1) { - extent.hi[i] = 0; - colors.hi[i] = 0; - } else { - size_t pieces = strategy->dim[dimoff + i]; - size_t chunks = (in1->bounds[i] + pieces - 1) / pieces; - extent.hi[i] = chunks - 1; /*inclusive*/ - colors.hi[i] = pieces - 1; /*inclusive*/ - } - } - in1_transform = transform; - in1_extent = extent; - in1_colors = colors; -} - -template <int DIM> -void -MatMul::compute_in2_parameters(Tensor* in2, Tensor* out) -{ - Rect<DIM> extent, colors; - Transform<DIM, DIM> transform; - for (int i = 0; i < DIM; i++) - for (int j = 0; j < DIM; j++) transform[i][j] = 0; - assert(out->bounds.size() >= in2->bounds.size()); - size_t dimoff = out->bounds.size() - in2->bounds.size(); - for (int i = 0; i < DIM; i++) { - extent.lo[i] = 0; - colors.lo[i] = 0; - if (i == (DIM - 2)) { - /* need the whole dimension */ - extent.hi[i] = in2->bounds[i] - 1; /*inclusive*/ - colors.hi[i] = 0; - } else if (in2->bounds[i] == 1) { - extent.hi[i] = 0; - colors.hi[i] = 0; - } else { - size_t pieces = strategy->dim[dimoff + i]; - size_t chunks = (in2->bounds[i] + pieces - 1) / pieces; - extent.hi[i] = chunks - 1; /*inclusive*/ - colors.hi[i] = pieces - 1; /*inclusive*/ - } - } - in2_transform = transform; - in2_extent = extent; - in2_colors = colors; -} - -void -MatMul::Configure(Tensor* in1, Tensor* in2, Tensor* out) -{ - assert(in1 != nullptr); - assert(in2 != nullptr); - assert(out != nullptr); - inputs.push_back(in1); - inputs.push_back(in2); - outputs.push_back(out); - - if ((in1->bounds.size() == 1) && (in2->bounds.size() == 1)) { - fprintf(stderr, "TODO: support for dot-product in matmul operator"); - abort(); - } else if (in1->bounds.size() == 1) { - const size_t in2_dim = in2->bounds.size(); - const size_t out_dim = out->bounds.size(); - assert(in2_dim >= 2); - assert(out_dim >= 1); - const size_t n = out->bounds[out_dim - 1]; - const size_t k = in1->bounds[0]; - assert(in2->bounds[in2_dim - 2] == k); - assert(in2->bounds[in2_dim - 1] == n); - // make sure all the other dimensions align or broadcast - unsigned in2_broadcasts = 0; - for (unsigned off = 3; off <= out_dim; off++) { - const size_t out_size = out->bounds[out_dim - off]; - if (off <= in2_dim) { - const size_t size = in2->bounds[in2_dim - off]; - assert((size == 1) || (size == out_size)); - if (size == 1) - in2_broadcasts |= (1 << (off - 3)); - } - } - - Rect<1> extent, colors; - Transform<1, 1> transform; - transform[0][0] = 0; - extent.lo[0] = 0; - extent.hi[0] = in1->bounds[0] - 1; // inclusive - colors.lo[0] = 0; - colors.hi[0] = 0; - in1_transform = transform; - in1_extent = extent; - in1_colors = colors; - - switch (in2->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - compute_in2_parameters<DIM>(in2, out); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - } else if (in2->bounds.size() == 1) { - const size_t in1_dim = in1->bounds.size(); - const size_t out_dim = out->bounds.size(); - assert(in1_dim >= 2); - const size_t m = (out_dim > 1) ? out->bounds[out_dim - 2] : 1; - assert(out->bounds[out_dim - 1] == 1); - assert(in1->bounds[in1_dim - 2] == m); - const size_t k = in1->bounds[in1_dim - 1]; - assert(in2->bounds[0] == k); - // make sure all the other dimensions align or broadcast - unsigned in1_broadcasts = 0; - for (unsigned off = 3; off <= out_dim; off++) { - const size_t out_size = out->bounds[out_dim - off]; - if (off <= in1_dim) { - const size_t size = in1->bounds[in1_dim - off]; - assert((size == 1) || (size == out_size)); - if (size == 1) - in1_broadcasts |= (1 << (off - 3)); - } - } - - switch (in1->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - compute_in1_parameters<DIM>(in1, out); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - - Rect<1> extent, colors; - Transform<1, 1> transform; - transform[0][0] = 0; - extent.lo[0] = 0; - extent.hi[0] = in2->bounds[0] - 1; // inclusive - colors.lo[0] = 0; - colors.hi[0] = 0; - in2_transform = transform; - in2_extent = extent; - in2_colors = colors; - - } else { - // all tensors have at least two dimensions - const size_t in1_dim = in1->bounds.size(); - const size_t in2_dim = in2->bounds.size(); - const size_t out_dim = out->bounds.size(); - assert(in1_dim >= 2); - assert(in2_dim >= 2); - assert(out_dim >= 2); - const size_t m = out->bounds[out_dim - 2]; - const size_t n = out->bounds[out_dim - 1]; - assert(in1->bounds[in1_dim - 2] == m); - const size_t k = in1->bounds[in1_dim - 1]; - assert(in2->bounds[in2_dim - 2] == k); - assert(in2->bounds[in2_dim - 1] == n); - // make sure all the other dimensions align or can broadcast - unsigned in1_broadcasts = 0, in2_broadcasts = 0; - for (unsigned off = 3; off <= out_dim; off++) { - const size_t out_size = out->bounds[out_dim - off]; - if (off <= in1_dim) { - const size_t size = in1->bounds[in1_dim - off]; - assert((size == 1) || (size == out_size)); - if (size == 1) - in1_broadcasts |= (1 << (off - 3)); - } - if (off <= in2_dim) { - const size_t size = in2->bounds[in2_dim - off]; - assert((size == 1) || (size == out_size)); - if (size == 1) - in2_broadcasts |= (1 << (off - 3)); - } - } - - // Finally fill in the input transforms, extents, and colors for the inputs - switch (in1->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - compute_in1_parameters<DIM>(in1, out); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - - switch (in2->bounds.size()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - compute_in2_parameters<DIM>(in2, out); \ - break; \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - abort(); - } - } - - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model); - vec_ptr->emplace_back(in1); - vec_ptr->emplace_back(in2); - vec_ptr->emplace_back(out); -} - -Domain -MatMul::GetIn1Bounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -Domain -MatMul::GetIn2Bounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -Domain -MatMul::GetOutBounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -MatMul::Load(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -MatMul::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -MatMul::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -MatMul::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -MatMul::Free(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static instantiations*/ -MatMul::FunctorTable MatMul::in1_functors; -MatMul::FunctorTable MatMul::in2_functors; - -/*static*/ void -MatMul::PreregisterTaskVariants(void) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -MatMul::forward_cpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -MatMulArgs::MatMulArgs(void) {} - -#ifdef LEGION_USE_CUDA -/*static*/ void -MatMul::forward_gpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -#endif - -}}} // namespace triton::backend::legion
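MatMul::Configure validates that the trailing two dimensions contract as (m, k) x (k, n) = (m, n) and that every leading batch dimension either matches the output or is 1 (broadcast). A standalone sketch of just that shape check, using the same right-to-left indexing as the code above:

#include <cassert>
#include <vector>

// Returns true when in1 x in2 = out is a legal (possibly broadcast) matmul.
bool matmul_shapes_ok(const std::vector<size_t>& in1,
                      const std::vector<size_t>& in2,
                      const std::vector<size_t>& out) {
  const size_t a = in1.size(), b = in2.size(), o = out.size();
  if (a < 2 || b < 2 || o < 2) return false;  // >=2-D case only
  const size_t m = out[o - 2], n = out[o - 1], k = in1[a - 1];
  if (in1[a - 2] != m || in2[b - 2] != k || in2[b - 1] != n) return false;
  for (size_t off = 3; off <= o; off++) {  // batch dims, right to left
    if (off <= a && in1[a - off] != 1 && in1[a - off] != out[o - off]) return false;
    if (off <= b && in2[b - off] != 1 && in2[b - off] != out[o - off]) return false;
  }
  return true;
}

int main() {
  assert(matmul_shapes_ok({2, 1, 4, 3}, {2, 7, 3, 5}, {2, 7, 4, 5}));  // the 1 broadcasts
  assert(!matmul_shapes_ok({4, 3}, {4, 5}, {4, 5}));                   // inner dims mismatch
}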
diff --git a/triton/src/test/mock/pool2d.cc b/triton/src/test/mock/pool2d.cc deleted file mode 100644 index 80d5793b10..0000000000 --- a/triton/src/test/mock/pool2d.cc +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "operators/pool2d.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Pool2DArgs::Pool2DArgs() {} - -Pool2D::Pool2D( - LegionModelState* model, const LayerStrategy* strategy, int kernelH, - int kernelW, int strideH, int strideW, int paddingH, int paddingW, - PoolType type, ActivationMode act, const char* name) - : Operator(model, strategy, OperatorType::OP_POOL2D, name, 1, 0, 1), - activation(act), pool_type(type), kernel_h(kernelH), kernel_w(kernelW), - stride_h(strideH), stride_w(strideW), padding_h(paddingH), - padding_w(paddingW) -{ -} - -Pool2D::~Pool2D() {} - -void -Pool2D::Configure(Tensor* input, Tensor* output) -{ - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model); - vec_ptr->emplace_back(input); - vec_ptr->emplace_back(output); -} - -void -Pool2D::Load(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Pool2D::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Pool2D::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Pool2D::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -Pool2D::Free(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -}}} // namespace triton::backend::legion
diff --git a/triton/src/test/mock/reshape.cc b/triton/src/test/mock/reshape.cc deleted file mode 100644 index 3c9a5ec823..0000000000 --- a/triton/src/test/mock/reshape.cc +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "operators/reshape.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Reshape::Reshape( - LegionModelState* model, const LayerStrategy* strategy, const char* name) - : Operator(model, strategy, OperatorType::OP_RESHAPE, name, 1, 0, 1) -{ -} - -void -Reshape::Configure(Tensor* input, Tensor* output) -{ - assert(input != nullptr); - assert(output != nullptr); - assert(input->type == output->type); - // Make sure that they have the same volumes - size_t input_volume = 1, output_volume = 1; - for (unsigned idx = 0; idx < input->bounds.size(); idx++) - input_volume *= input->bounds[idx]; - for (unsigned idx = 0; idx < output->bounds.size(); idx++) - output_volume *= output->bounds[idx]; - assert(input_volume == output_volume); - - // Group dimensions from the input and output tensors together from - // right-to-left to find ones that can be tiled together - int input_idx = input->bounds.size() - 1; - int output_idx = output->bounds.size() - 1; - while ((input_idx >= 0) && (output_idx >= 0)) { - std::vector<int> input_dims(1, input_idx); - std::vector<int> output_dims(1, output_idx); - size_t input_tile_volume = input->bounds[input_idx--]; - size_t output_tile_volume = output->bounds[output_idx--]; - while (input_tile_volume != output_tile_volume) { - if (input_tile_volume < output_tile_volume) { - input_dims.push_back(input_idx); - input_tile_volume *= input->bounds[input_idx--]; - } else { - output_dims.push_back(output_idx); - output_tile_volume *= output->bounds[output_idx--]; - } - } - input_groups.emplace_back(input_dims); - output_groups.emplace_back(output_dims); - } - // In order to use the output launch space, we need to make sure that - // all but the earliest dimension in each output group has a partitioning - // strategy of 1 or else we won't be able to compute a partition that - // will allow for densely tiled copies. In the future we could fix this - // by computing a generalized index launch space and then mapping that - // onto the original output launch space or just by using affine indirect - // copy launchers when they are available. - for (unsigned g = 0; g < output_groups.size(); g++) { - const std::vector<int>& input_group = input_groups[g]; - const std::vector<int>& output_group = output_groups[g]; - for (unsigned idx = 0; idx < (output_group.size() - 1); idx++) - assert(strategy->dim[output_group[idx]] == 1); - // the size of the earliest dimension in the input group must also - // be divisible by the number of chunks - assert( - (input->bounds[input_group.back()] % - strategy->dim[output_group.back()]) == 0); - // the output bounds also need to be evenly divisible too or this will not - // work - assert( - (output->bounds[output_group.back()] % - strategy->dim[output_group.back()]) == 0); - } - inputs.push_back(input); - outputs.push_back(output); - - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model); - vec_ptr->emplace_back(input); - vec_ptr->emplace_back(output); -} - -Domain -Reshape::GetInputBounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -Domain -Reshape::GetOutputBounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Reshape::Load(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Reshape::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Reshape::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Reshape::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Reshape::Free(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Reshape::PreregisterTaskVariants(void) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Reshape::forward_cpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -ReshapeArgs::ReshapeArgs(void) {} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Reshape::forward_gpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -#endif - -}}} // namespace triton::backend::legion
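The right-to-left grouping in Reshape::Configure pairs runs of input and output dimensions that have equal volume. A standalone sketch with an illustrative (2, 3, 4) to (6, 4) reshape; it assumes, as the operator asserts, that the two shapes have the same total volume:

#include <cstdio>
#include <vector>

int main() {
  std::vector<long> in = {2, 3, 4}, out = {6, 4};
  int i = (int)in.size() - 1, o = (int)out.size() - 1;
  while (i >= 0 && o >= 0) {
    std::vector<int> gi{i}, go{o};
    long vi = in[i--], vo = out[o--];
    // Grow whichever side has the smaller running volume until they agree.
    while (vi != vo) {
      if (vi < vo) { gi.push_back(i); vi *= in[i--]; }
      else         { go.push_back(o); vo *= out[o--]; }
    }
    std::printf("group: in dims");
    for (int d : gi) std::printf(" %d", d);
    std::printf(" <-> out dims");
    for (int d : go) std::printf(" %d", d);
    std::printf(" (volume %ld)\n", vi);
  }
  // Prints: group: in dims 2 <-> out dims 1 (volume 4)
  //         group: in dims 1 0 <-> out dims 0 (volume 6)
}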
diff --git a/triton/src/test/mock/softmax.cc b/triton/src/test/mock/softmax.cc deleted file mode 100644 index 75f04a5ad3..0000000000 --- a/triton/src/test/mock/softmax.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "operators/softmax.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -Softmax::Softmax( - LegionModelState* model, const LayerStrategy* strategy, unsigned dim, - const char* name) - : Operator(model, strategy, OperatorType::OP_SOFTMAX, name, 1, 0, 1), - dim(dim) -{ -} - -void -Softmax::Configure(Tensor* input, Tensor* output) -{ - assert(input != nullptr); - assert(output != nullptr); - assert(input->type == output->type); - // Make sure that they have the same bounds - assert(input->bounds.size() == output->bounds.size()); - for (unsigned idx = 0; idx < input->bounds.size(); idx++) - assert(input->bounds[idx] == output->bounds[idx]); - inputs.push_back(input); - outputs.push_back(output); - - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model); - vec_ptr->emplace_back(input); - vec_ptr->emplace_back(output); -} - -Domain -Softmax::GetBounds(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Softmax::Load(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Softmax::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Softmax::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Softmax::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -Softmax::Free(Processor proc) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Softmax::PreregisterTaskVariants(void) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -/*static*/ void -Softmax::forward_cpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -SoftmaxArgs::SoftmaxArgs(void) {} - -#ifdef LEGION_USE_CUDA -/*static*/ void -Softmax::forward_gpu( - const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, - Runtime* runtime) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -#endif - -}}} // namespace triton::backend::legion
diff --git a/triton/src/test/mock/strategy.cc b/triton/src/test/mock/strategy.cc deleted file mode 100644 index 257effe9bf..0000000000 --- a/triton/src/test/mock/strategy.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "strategy.h" -#include - -namespace triton { namespace backend { namespace legion { - -// -// Mock implementation of the class with synthetic state; -// raises an error if unintended methods are invoked -// -LayerStrategy::LayerStrategy( - Legion::ShardingID sid, Legion::MappingTagID tag, Legion::Runtime* runtime) - : kind(Realm::Processor::Kind::LOC_PROC), sharding_function(nullptr), - tag(tag) -{ -} - -LayerStrategy::~LayerStrategy() {} - -bool -LayerStrategy::is_local_processor(Realm::Processor proc) const -{ - for (size_t i = 0; i < nProcs; ++i) { - if (local_processors[i] == proc) { - return true; - } - } - return false; -} - -unsigned -LayerStrategy::find_local_offset(Realm::Processor proc) const -{ - for (unsigned idx = 0; idx < nProcs; idx++) - if (local_processors[idx] == proc) - return idx; - throw std::invalid_argument("Getting offset for a non-local processor"); -} - -std::unique_ptr<LayerStrategy> -CreateMockLayerStrategy( - const std::vector<Realm::Processor>& local_processors, - const std::vector<Realm::Processor>& global_processors) -{ - std::unique_ptr<LayerStrategy> ls(new LayerStrategy(0, 0, nullptr)); - ls->nProcs = local_processors.size(); - for (size_t i = 0; i < local_processors.size(); ++i) { - ls->local_processors[i] = local_processors[i]; - } - ls->global_processors = global_processors; - return ls; -} - -PartitionStrategy::~PartitionStrategy() {} - -}}} // namespace triton::backend::legion
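CreateMockLayerStrategy fills a fixed-capacity processor array plus a count, which is what is_local_processor and find_local_offset scan. A standalone sketch of that factory pattern; plain ints stand in for Realm::Processor, and the capacity constant is an assumed placeholder for MAX_LOCAL_PROCS:

#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

constexpr std::size_t kMaxLocalProcs = 16;  // assumed bound

struct MockLayerStrategy {
  std::size_t nProcs = 0;
  int local_processors[kMaxLocalProcs] = {};
  std::vector<int> global_processors;

  bool is_local_processor(int proc) const {
    for (std::size_t i = 0; i < nProcs; ++i)
      if (local_processors[i] == proc) return true;
    return false;
  }
};

std::unique_ptr<MockLayerStrategy> create_mock(
    const std::vector<int>& local, const std::vector<int>& global) {
  assert(local.size() <= kMaxLocalProcs);
  auto ls = std::make_unique<MockLayerStrategy>();
  ls->nProcs = local.size();
  for (std::size_t i = 0; i < local.size(); ++i) ls->local_processors[i] = local[i];
  ls->global_processors = global;
  return ls;
}

int main() {
  auto ls = create_mock({1}, {1, 2});
  assert(ls->is_local_processor(1) && !ls->is_local_processor(2));
}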
diff --git a/triton/src/test/mock/triton_error.cc b/triton/src/test/mock/triton_error.cc deleted file mode 100644 index e90988900d..0000000000 --- a/triton/src/test/mock/triton_error.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <string> -#include "triton/core/tritonserver.h" - -namespace { - -// -// Duplication of TRITONSERVER_Error implementation -// -class TritonServerError { - public: - static TRITONSERVER_Error* Create( - TRITONSERVER_Error_Code code, const char* msg); - - static const char* CodeString(const TRITONSERVER_Error_Code code); - TRITONSERVER_Error_Code Code() const { return code_; } - const std::string& Message() const { return msg_; } - - private: - TritonServerError(TRITONSERVER_Error_Code code, const std::string& msg) - : code_(code), msg_(msg) - { - } - TritonServerError(TRITONSERVER_Error_Code code, const char* msg) - : code_(code), msg_(msg) - { - } - - TRITONSERVER_Error_Code code_; - const std::string msg_; -}; - -TRITONSERVER_Error* -TritonServerError::Create(TRITONSERVER_Error_Code code, const char* msg) -{ - return reinterpret_cast<TRITONSERVER_Error*>( - new TritonServerError(code, msg)); -} - -const char* -TritonServerError::CodeString(const TRITONSERVER_Error_Code code) -{ - switch (code) { - case TRITONSERVER_ERROR_UNKNOWN: - return "Unknown"; - case TRITONSERVER_ERROR_INTERNAL: - return "Internal"; - case TRITONSERVER_ERROR_NOT_FOUND: - return "Not found"; - case TRITONSERVER_ERROR_INVALID_ARG: - return "Invalid argument"; - case TRITONSERVER_ERROR_UNAVAILABLE: - return "Unavailable"; - case TRITONSERVER_ERROR_UNSUPPORTED: - return "Unsupported"; - case TRITONSERVER_ERROR_ALREADY_EXISTS: - return "Already exists"; - default: - break; - } - - return ""; -} - -} // namespace - -#ifdef __cplusplus -extern "C" { -#endif - -TRITONSERVER_Error* -TRITONSERVER_ErrorNew(TRITONSERVER_Error_Code code, const char* msg) -{ - return reinterpret_cast<TRITONSERVER_Error*>( - TritonServerError::Create(code, msg)); -} - -void -TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error) -{ - TritonServerError* lerror = reinterpret_cast<TritonServerError*>(error); - delete lerror; -} - -TRITONSERVER_Error_Code -TRITONSERVER_ErrorCode(TRITONSERVER_Error* error) -{ - TritonServerError* lerror = reinterpret_cast<TritonServerError*>(error); - return lerror->Code(); -} - -const char* -TRITONSERVER_ErrorCodeString(TRITONSERVER_Error* error) -{ - TritonServerError* lerror = reinterpret_cast<TritonServerError*>(error); - return TritonServerError::CodeString(lerror->Code()); -} - -const char* -TRITONSERVER_ErrorMessage(TRITONSERVER_Error* error) -{ - TritonServerError* lerror = reinterpret_cast<TritonServerError*>(error); - return lerror->Message().c_str(); -} - -#ifdef __cplusplus -} -#endif
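The duplicated TRITONSERVER_Error implementation above is the usual opaque-handle idiom: a C API exposes an incomplete type while the implementation reinterpret_casts to and from a private C++ class. A standalone sketch of that idiom with hypothetical names (my_error_*, not part of any Triton API):

#include <cassert>
#include <string>

extern "C" { typedef struct my_error_t my_error_t; }  // opaque to callers

namespace {
class ErrorImpl {
 public:
  explicit ErrorImpl(std::string msg) : msg_(std::move(msg)) {}
  const std::string& Message() const { return msg_; }
 private:
  std::string msg_;
};
}  // namespace

extern "C" my_error_t* my_error_new(const char* msg) {
  return reinterpret_cast<my_error_t*>(new ErrorImpl(msg));
}
extern "C" const char* my_error_message(my_error_t* err) {
  return reinterpret_cast<ErrorImpl*>(err)->Message().c_str();
}
extern "C" void my_error_delete(my_error_t* err) {
  delete reinterpret_cast<ErrorImpl*>(err);
}

int main() {
  my_error_t* e = my_error_new("parse failure");
  assert(std::string(my_error_message(e)) == "parse failure");
  my_error_delete(e);
}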
diff --git a/triton/src/test/mock/unary.cc b/triton/src/test/mock/unary.cc deleted file mode 100755 index 9e560832a0..0000000000 --- a/triton/src/test/mock/unary.cc +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "operators/unary.h" - -using namespace Legion; - -namespace triton { namespace backend { namespace legion { - -UnaryArgs::UnaryArgs() {} - -UnaryOperator::UnaryOperator( - LegionModelState* model, const LayerStrategy* strategy, OperatorType type, - const void* scalar, DataType scalar_type, bool inplace, const char* name) - : Operator( - model, strategy, type, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/), - scalar_type(scalar_type), inplace(inplace) -{ -} - -UnaryOperator::~UnaryOperator() {} - -void -UnaryOperator::Configure(Tensor* input, Tensor* output) -{ - assert(input != nullptr); - assert(output != nullptr); - assert(input->type == scalar_type); - assert((op_type == OP_CAST) || (input->type == output->type)); - assert(!inplace || (input == output)); - // Make sure that they have the same bounds - assert(input->bounds.size() == output->bounds.size()); - for (unsigned idx = 0; idx < input->bounds.size(); idx++) - assert(input->bounds[idx] == output->bounds[idx]); - - // Hack so that we can access the tensors in the tests - auto vec_ptr = reinterpret_cast<std::vector<Tensor*>*>(model); - vec_ptr->emplace_back(input); - vec_ptr->emplace_back(output); -} - -void -UnaryOperator::Load(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -UnaryOperator::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -UnaryOperator::forward( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -UnaryOperator::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Legion::Runtime* runtime, Legion::Context ctx, Legion::MapperID mapper) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} -void -UnaryOperator::Free(Realm::Processor processor) -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -void -UnaryOperator::PreregisterTaskVariants() -{ - throw std::invalid_argument( - "This function shouldn't be called in parser unit test"); -} - -}}} // namespace triton::backend::legion
diff --git a/triton/src/test/onnx_parser_test.cc b/triton/src/test/onnx_parser_test.cc deleted file mode 100644 index 64c8f6bf29..0000000000 --- a/triton/src/test/onnx_parser_test.cc +++ /dev/null @@ -1,967 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "gtest/gtest.h" - -#include "onnx_parser.h" -#include "operators/binary.h" -#include "operators/conv2d.h" -#include "operators/pool2d.h" -#include "operators/softmax.h" -#include "operators/unary.h" - -namespace { - -namespace tbl = triton::backend::legion; - -#define CHECK_GENERAL_OPERATOR_ATTRIBUTES( \ - op__, op_type__, model__, strategy__, num_input__, num_weight__, \ - num_output__) \ - do { \ - EXPECT_EQ(op__->op_type, op_type__); \ - EXPECT_TRUE( \ - op__->model == reinterpret_cast<tbl::LegionModelState*>(model__)) \ - << "Expect: " << model__ << "; Got: " << op__->model; \ - EXPECT_TRUE(op__->strategy == strategy__) \ - << "Expect: " << strategy__ << "; Got: " << op__->strategy; \ - EXPECT_EQ(op__->num_inputs, num_input__); \ - EXPECT_EQ(op__->num_weights, num_weight__); \ - EXPECT_EQ(op__->num_outputs, num_output__); \ - } while (false) - -#define CHECK_GENERAL_TENSOR_ATTRIBUTES( \ - t__, owner__, is_weight__, dtype__, dims_vec__) \ - do { \ - if (is_weight__) { \ - EXPECT_TRUE(dynamic_cast<tbl::Weights*>(t__) != nullptr) \ - << "Expect tensor to be a Weights instance"; \ - } else { \ - EXPECT_FALSE(dynamic_cast<tbl::Weights*>(t__) != nullptr) \ - << "Expect tensor not to be a Weights instance"; \ - } \ - EXPECT_TRUE(t__->owner == owner__) \ - << ((owner__ == nullptr) ? "Expect tensor not owned by the operator" \ - : "Expect tensor owned by the operator"); \ - EXPECT_EQ(t__->type, dtype__); \ - EXPECT_EQ(t__->bounds, dims_vec__); \ - } while (false) - -class OnnxParserSingleNodeSingleProcessorTest : public ::testing::Test { - public: - OnnxParserSingleNodeSingleProcessorTest() - : layer_strategy_(0, 0, nullptr), local_cpus_(1) - { - // Set up layer strategy and function for finding local processor - // (LegionTritonRuntime::FindLocalProcessors). - // For tests in this fixture there should be only one CPU processor - // which is "local" to the machine. - // Note that the layer strategy doesn't specify dims as it depends on - // the operator it describes; the test should set it properly or have - // a fake operator implementation that is independent of LayerStrategy::dim - local_cpus_[0].id = 1; - find_local_processor_fn_ = [this](Realm::Processor::Kind kind) - -> const std::vector<Realm::Processor>& { - switch (kind) { - case Realm::Processor::LOC_PROC: - return local_cpus_; - case Realm::Processor::TOC_PROC: - return local_cpus_; - default: - throw std::invalid_argument("Unknown processor kind"); - } - return local_cpus_; - }; - layer_strategy_.kind = Realm::Processor::LOC_PROC; - layer_strategy_.nProcs = 1; - layer_strategy_.local_processors[0] = local_cpus_[0]; - layer_strategy_.global_processors.push_back(local_cpus_[0]); - } - - tbl::LayerStrategy layer_strategy_; - std::function<const std::vector<Realm::Processor>&(Realm::Processor::Kind)> - find_local_processor_fn_; - std::vector<Realm::Processor> local_gpus_; - std::vector<Realm::Processor> local_cpus_; -}; - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseAdd) -{ - std::vector<tbl::Tensor*> model_stub; - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/add.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 3) << "Expect 3 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - - auto generated_op = dynamic_cast<tbl::BinaryOperator*>(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Binary instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_EW_ADD, &model_stub, &layer_strategy_, - 2, 0, 1); - - ASSERT_EQ(inputs.size(), 2) << "Expect 2 inputs are parsed"; - for (size_t i = 0; i <= 1; i++) { - ASSERT_TRUE(inputs[i].second == model_stub[i]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[i].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({4, 2})); - } - - auto output = model_stub[2]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({4, 2})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseSub) -{ - std::vector<tbl::Tensor*> model_stub; - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/sub.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 3) << "Expect 3 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - - auto generated_op = dynamic_cast<tbl::BinaryOperator*>(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Binary instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_EW_SUB, &model_stub, &layer_strategy_, - 2, 0, 1); - - ASSERT_EQ(inputs.size(), 2) << "Expect 2 inputs are parsed"; - for (size_t i = 0; i <= 1; i++) { - ASSERT_TRUE(inputs[i].second == model_stub[i]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[i].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({4, 2})); - } - - auto output = model_stub[2]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({4, 2})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseMul) -{ - std::vector<tbl::Tensor*> model_stub; - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/mul.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 3) << "Expect 3 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - - auto generated_op = dynamic_cast<tbl::BinaryOperator*>(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Binary instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_EW_MUL, &model_stub, &layer_strategy_, - 2, 0, 1); - - ASSERT_EQ(inputs.size(), 2) << "Expect 2 inputs are parsed"; - for (size_t i = 0; i <= 1; i++) { - ASSERT_TRUE(inputs[i].second == model_stub[i]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[i].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({4, 2})); - } - - auto output = model_stub[2]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({4, 2})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseAvgPool) -{ - std::vector<tbl::Tensor*> model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/avg_pool.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast<tbl::Pool2D*>(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Pool2D instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_POOL2D, &model_stub, &layer_strategy_, - 1, 0, 1); - EXPECT_EQ(generated_op->activation, tbl::ActivationMode::AC_MODE_NONE); - EXPECT_EQ(generated_op->pool_type, tbl::PoolType::POOL_AVG); - EXPECT_EQ(generated_op->kernel_h, 2); - EXPECT_EQ(generated_op->kernel_w, 2); - EXPECT_EQ(generated_op->stride_h, 1); - EXPECT_EQ(generated_op->stride_w, 1); - EXPECT_EQ(generated_op->padding_h, 0); - EXPECT_EQ(generated_op->padding_w, 0); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({1, 3, 30, 30})); - - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector<size_t>({1, 3, 29, 29})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseAvgPoolAutoPad) -{ - std::vector<tbl::Tensor*> model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/avg_pool_autopad.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value 'SAME_LOWER' for attribute 'auto_pad' in " - "'AveragePool' layer named '', currently supported value is 'NOTSET'"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseAvgPoolCeil) -{ - std::vector<tbl::Tensor*> model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/avg_pool_ceil.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value for attribute 'ceil_mode' in 'AveragePool' " - "layer named '', currently supported value is 0"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseAvgPoolCountIncludePad) -{ - std::vector<tbl::Tensor*> model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/avg_pool_count_include_pad.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value for attribute 'count_include_pad' in " - "'AveragePool' layer named '', currently supported value is 0"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseAvgPoolPad) -{ - std::vector<tbl::Tensor*> model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast<tbl::LegionModelState*>(&model_stub), - {reinterpret_cast<const tbl::LayerStrategy*>(&layer_strategy_)}); - std::vector<std::pair<std::string, tbl::Tensor*>> inputs; - std::vector<std::pair<std::string, tbl::Tensor*>> outputs; - std::vector<tbl::Operator*> layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast<tbl::LegionModelState*>(&model_stub), &strategy, - "data/avg_pool_pad.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << 
TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Pool2D instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_POOL2D, &model_stub, &layer_strategy_, - 1, 0, 1); - EXPECT_EQ(generated_op->activation, tbl::ActivationMode::AC_MODE_NONE); - EXPECT_EQ(generated_op->pool_type, tbl::PoolType::POOL_AVG); - EXPECT_EQ(generated_op->kernel_h, 2); - EXPECT_EQ(generated_op->kernel_w, 2); - EXPECT_EQ(generated_op->stride_h, 3); - EXPECT_EQ(generated_op->stride_w, 3); - EXPECT_EQ(generated_op->padding_h, 1); - EXPECT_EQ(generated_op->padding_w, 1); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({1, 3, 30, 30})); - - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({1, 3, 11, 11})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseCast) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/cast.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a UnaryOperator instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_CAST, &model_stub, &layer_strategy_, - 1, 0, 1); - EXPECT_EQ(generated_op->scalar_type, tbl::DataType::DT_FLOAT); - EXPECT_EQ(generated_op->inplace, false); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({1, 3})); - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_DOUBLE, - std::vector({1, 3})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseConv2D) -{ - // Data section - std::vector bias_data = {0, 1}; - std::vector>>> weight_data = { - {{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}, - {{{9, 10, 11}, {12, 13, 14}, {15, 16, 17}}}}; - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/conv2d_with_bias.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 4) << "Expect 4 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != 
nullptr) - << "Expect the operator to be a Conv2D instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_CONV2D, &model_stub, &layer_strategy_, - 1, 2, 1); - // [gluo FIXME] expected value are set based on knowledge of the model file, - // should add sanity check before running test that the files are intact - // Check op specific attributes - EXPECT_EQ(generated_op->activation, tbl::ActivationMode::AC_MODE_NONE); - EXPECT_EQ(generated_op->in_channels, 1); - EXPECT_EQ(generated_op->out_channels, 2); - EXPECT_EQ(generated_op->kernel_h, 3); - EXPECT_EQ(generated_op->kernel_w, 3); - EXPECT_EQ(generated_op->stride_h, 1); - EXPECT_EQ(generated_op->stride_w, 1); - EXPECT_EQ(generated_op->padding_h, 0); - EXPECT_EQ(generated_op->padding_w, 0); - EXPECT_EQ(generated_op->groups, 1); - EXPECT_EQ(generated_op->use_bias, true); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({4, 1, 5, 5})); - - { - auto weight = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - weight, generated_op, true, tbl::DataType::DT_FLOAT, - std::vector({2, 1, 3, 3})); - auto bound = - generated_op->GetWeightBounds(layer_strategy_.local_processors[0]); - const float* data_allocation = reinterpret_cast( - dynamic_cast(weight)->local_allocation[0]); - for (size_t oc = bound.lo[0]; oc <= bound.hi[0]; ++oc) { - for (size_t ic = bound.lo[1]; ic <= bound.hi[1]; ++ic) { - for (size_t kh = bound.lo[2]; kh <= bound.hi[2]; ++kh) { - for (size_t kw = bound.lo[3]; kw <= bound.hi[3]; ++kw) { - EXPECT_EQ(weight_data[oc][ic][kh][kw], *data_allocation) - << "Mismatched value at weight entry (" << oc << ", " << ic - << ", " << kh << ", " << kw << ")"; - ++data_allocation; - } - } - } - } - } - - { - auto bias = model_stub[2]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - bias, generated_op, true, tbl::DataType::DT_FLOAT, - std::vector({2})); - auto bound = - generated_op->GetBiasBounds(layer_strategy_.local_processors[0]); - const float* data_allocation = reinterpret_cast( - dynamic_cast(bias)->local_allocation[0]); - for (size_t idx = bound.lo[0]; idx <= bound.hi[0]; ++idx) { - EXPECT_EQ(bias_data[idx], *data_allocation) - << "Mismatched value at weight entry (" << idx << ")"; - ++data_allocation; - } - } - - auto output = model_stub[3]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({4, 2, 3, 3})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseIdentity) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/identity.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a UnaryOperator instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_IDENTITY, &model_stub, - &layer_strategy_, 1, 0, 1); - // [gluo FIXME] expected value are set based on knowledge of 
the model file, - // should add sanity check before running test that the files are intact - // Check op specific attributes - EXPECT_EQ(generated_op->scalar_type, tbl::DataType::DT_FLOAT); - EXPECT_EQ(generated_op->inplace, false); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({4, 1, 5, 5})); - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({4, 1, 5, 5})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseMaxPool) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/max_pool.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Pool2D instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_POOL2D, &model_stub, &layer_strategy_, - 1, 0, 1); - EXPECT_EQ(generated_op->activation, tbl::ActivationMode::AC_MODE_NONE); - EXPECT_EQ(generated_op->pool_type, tbl::PoolType::POOL_MAX); - EXPECT_EQ(generated_op->kernel_h, 5); - EXPECT_EQ(generated_op->kernel_w, 5); - EXPECT_EQ(generated_op->stride_h, 2); - EXPECT_EQ(generated_op->stride_w, 2); - EXPECT_EQ(generated_op->padding_h, 2); - EXPECT_EQ(generated_op->padding_w, 2); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({1, 1, 5, 5})); - - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({1, 1, 3, 3})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseMaxPoolAutoPad) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/max_pool_autopad.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value 'SAME_UPPER' for attribute 'auto_pad' in " - "'MaxPool' " - "layer named '', currently supported value is 'NOTSET'"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - 
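
The output shapes asserted by the Pool2D tests above all follow the standard floor-mode pooling size formula, out = floor((in + 2*pad - kernel) / stride) + 1; this is the only case the parser has to handle, since the tests show that non-default ceil_mode and dilations are rejected. A minimal Python sketch (editorial, not part of the deleted sources) that reproduces the asserted dimensions:

    def pool_out(in_size, kernel, stride=1, pad=0):
        # Floor-mode output extent along one spatial axis.
        return (in_size + 2 * pad - kernel) // stride + 1

    assert pool_out(30, kernel=2) == 29                   # avg_pool: 30x30 -> 29x29
    assert pool_out(30, kernel=2, stride=3, pad=1) == 11  # avg_pool_pad: 30x30 -> 11x11
    assert pool_out(5, kernel=5, stride=2, pad=2) == 3    # max_pool: 5x5 -> 3x3
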
-TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseMaxPoolCeil) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/max_pool_ceil.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value for attribute 'ceil_mode' in 'MaxPool' " - "layer named '', currently supported value is 0"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseMaxPoolDilations) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/max_pool_dilations.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value for attribute 'dilations' in 'MaxPool' " - "layer named '', each of the attribute value must be 1"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseMaxPoolOrder) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/max_pool_order.onnx", &inputs, &outputs, &layers); - auto expected_err = std::string( - "Unsupported attribute value for attribute 'storage_order' in 'MaxPool' " - "layer named '', currently supported value is 0"); - ASSERT_TRUE(err != nullptr) << "Unexpected successful model load"; - ASSERT_TRUE(TRITONSERVER_ERROR_UNSUPPORTED == TRITONSERVER_ErrorCode(err)) - << "Wrong error type" << std::endl - << "Actual error type: " << TRITONSERVER_ErrorCodeString(err) << std::endl - << "Expected error type: " - << "Unsupported"; - ASSERT_TRUE(expected_err == TRITONSERVER_ErrorMessage(err)) - << "Wrong error message" << std::endl - << "Actual error message: " << TRITONSERVER_ErrorMessage(err) << std::endl - << "Expected error message: " << expected_err; -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseReciprocal) -{ - std::vector 
model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/reciprocal.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a UnaryOperator instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_RECIPROCAL, &model_stub, - &layer_strategy_, 1, 0, 1); - // [gluo FIXME] expected values are set based on knowledge of the model file, - // should add sanity check before running test that the files are intact - // Check op specific attributes - EXPECT_EQ(generated_op->scalar_type, tbl::DataType::DT_FLOAT); - EXPECT_EQ(generated_op->inplace, false); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({1, 3})); - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({1, 3})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseSoftmax) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/softmax.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Softmax instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_SOFTMAX, &model_stub, - &layer_strategy_, 1, 0, 1); - EXPECT_EQ(generated_op->dim, 0); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({3, 1})); - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({3, 1})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseSoftmaxDefaultAxis) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/softmax_default_axis.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Softmax
instance"; - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_SOFTMAX, &model_stub, - &layer_strategy_, 1, 0, 1); - EXPECT_EQ(generated_op->dim, 1); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseSoftmaxNegativeAxis) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/softmax_negative_axis.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a Softmax instance"; - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_SOFTMAX, &model_stub, - &layer_strategy_, 1, 0, 1); - EXPECT_EQ(generated_op->dim, 0); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseSqrt) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/sqrt.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a UnaryOperator instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_SQRT, &model_stub, &layer_strategy_, - 1, 0, 1); - EXPECT_EQ(generated_op->scalar_type, tbl::DataType::DT_FLOAT); - EXPECT_EQ(generated_op->inplace, false); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({3, 1})); - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({3, 1})); -} - -TEST_F(OnnxParserSingleNodeSingleProcessorTest, ParseTanh) -{ - std::vector model_stub; - tbl::PartitionStrategy strategy( - reinterpret_cast(&model_stub), - {reinterpret_cast(&layer_strategy_)}); - std::vector> inputs; - std::vector> outputs; - std::vector layers; - - auto err = tbl::OnnxParser::LoadModel( - find_local_processor_fn_, - reinterpret_cast(&model_stub), &strategy, - "data/tanh.onnx", &inputs, &outputs, &layers); - ASSERT_TRUE(err == nullptr) << TRITONSERVER_ErrorMessage(err); - - ASSERT_EQ(model_stub.size(), 2) << "Expect 2 tensors are parsed"; - ASSERT_EQ(layers.size(), 1) << "Expect 1 layer is parsed"; - auto generated_op = dynamic_cast(layers[0]); - ASSERT_TRUE(generated_op != nullptr) - << "Expect the operator to be a UnaryOperator instance"; - - CHECK_GENERAL_OPERATOR_ATTRIBUTES( - generated_op, tbl::OperatorType::OP_TANH, &model_stub, &layer_strategy_, - 1, 0, 1); - EXPECT_EQ(generated_op->scalar_type, tbl::DataType::DT_FLOAT); - EXPECT_EQ(generated_op->inplace, false); - - // Check associated tensors - ASSERT_EQ(inputs.size(), 1) << "Expect 1 input is parsed"; - 
ASSERT_TRUE(inputs[0].second == model_stub[0]); - CHECK_GENERAL_TENSOR_ATTRIBUTES( - inputs[0].second, nullptr, false, tbl::DataType::DT_FLOAT, - std::vector({3, 1})); - auto output = model_stub[1]; - CHECK_GENERAL_TENSOR_ATTRIBUTES( - output, generated_op, false, tbl::DataType::DT_FLOAT, - std::vector({3, 1})); -} - - -} // namespace - -int -main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/triton/src/test/scripts/onnx_maker.py b/triton/src/test/scripts/onnx_maker.py deleted file mode 100755 index 0983d097be..0000000000 --- a/triton/src/test/scripts/onnx_maker.py +++ /dev/null @@ -1,613 +0,0 @@ -#------------------------------------------------------------------------------# -# Copyright 2022 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#------------------------------------------------------------------------------# - -from onnx import helper -from onnx import TensorProto as tp -from onnx import checker -from onnx import save -import sys -import argparse -import os - -## Add - - -def binary_models(path): - binary_node_names = ["Add", "Sub", "Mul"] - for node_name in binary_node_names: - binary(path, node_name) - - -def binary(path, node_name): - node = helper.make_node( - node_name, - inputs=['input0', 'input1'], - outputs=['output'], - ) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input0', tp.FLOAT, [4, 2]), - helper.make_tensor_value_info('input1', tp.FLOAT, [4, 2]) - ], [helper.make_tensor_value_info('output', tp.FLOAT, [4, 2])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, '{}.onnx'.format(node_name.lower()))) - - -## Average Pool - - -def avg_pool_models(path): - avg_pool(path) - avg_pool_autopad(path) - avg_pool_ceil(path) - avg_pool_count_include_pad(path) - avg_pool_pad(path) - - -def avg_pool(path): - node = helper.make_node('AveragePool', - inputs=['input'], - outputs=['output'], - kernel_shape=[2, 2]) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3, 30, 30])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3, 29, 29])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'avg_pool.onnx')) - - -def avg_pool_autopad(path): - node = helper.make_node('AveragePool', - inputs=['input'], - outputs=['output'], - kernel_shape=[2, 2], - auto_pad='SAME_LOWER') - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3, 30, 30])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3, 29, 29])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'avg_pool_autopad.onnx')) - - -def avg_pool_ceil(path): - node = helper.make_node('AveragePool', - inputs=['input'], - outputs=['output'], - kernel_shape=[2, 2], - ceil_mode=True) - graph = 
helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3, 30, 30])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3, 29, 29])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'avg_pool_ceil.onnx')) - - -def avg_pool_count_include_pad(path): - node = helper.make_node('AveragePool', - inputs=['input'], - outputs=['output'], - kernel_shape=[2, 2], - count_include_pad=2) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3, 30, 30])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3, 29, 29])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'avg_pool_count_include_pad.onnx')) - - -def avg_pool_pad(path): - node = helper.make_node( - 'AveragePool', - inputs=['input'], - outputs=['output'], - kernel_shape=[2, 2], - strides=[3, 3], - pads=[1, 1, 1, 1], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3, 30, 30])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3, 11, 11])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'avg_pool_pad.onnx')) - - -## Cast - - -def cast_models(path): - cast(path) - - -def cast(path): - node = helper.make_node( - 'Cast', - inputs=['input'], - outputs=['output'], - to=getattr(tp, 'DOUBLE'), - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3])], - [helper.make_tensor_value_info('output', tp.DOUBLE, [1, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'cast.onnx')) - - -## Conv - - -def conv_models(path): - conv(path) - conv_strides(path) - - -def conv(path): - node = helper.make_node( - 'Conv', - inputs=['input0', 'input1'], - outputs=['output'], - kernel_shape=[3, 3], - pads=[1, 1, 1, 1], - ) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input0', tp.FLOAT, [1, 1, 5, 5]), - helper.make_tensor_value_info('input1', tp.FLOAT, [1, 1, 3, 3]) - ], [helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 5, 5])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'conv.onnx')) - - -def conv_strides(path): - node = helper.make_node( - 'Conv', - inputs=['input0', 'input1'], - outputs=['output'], - kernel_shape=[3, 3], - pads=[1, 0, 1, 0], - strides=[2, 2], - ) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input0', tp.FLOAT, [1, 1, 5, 5]), - helper.make_tensor_value_info('input1', tp.FLOAT, [1, 1, 3, 3]) - ], [helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 4, 2])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'conv_strides.onnx')) - - -def conv_autopad(path): - node = helper.make_node( - 'Conv', - inputs=['input0', 'input1'], - outputs=['output'], - auto_pad='SAME_LOWER', - kernel_shape=[3, 3], - strides=[2, 2], - ) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input0', tp.FLOAT, [1, 1, 5, 5]), - helper.make_tensor_value_info('input1', tp.FLOAT, [1, 1, 3, 3]) - ], [helper.make_tensor_value_info('output', tp.FLOAT, [3, 3])]) - model = helper.make_model(graph, 
producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'conv_autopad.onnx')) - - -## Flatten - - -def flatten_models(path): - flatten(path) - flatten_default_axis(path) - flatten_negative_axis(path) - - -def flatten(path): - node = helper.make_node( - 'Flatten', - inputs=['input'], - outputs=['output'], - axis=1, - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [5, 4, 3, 2])], - [helper.make_tensor_value_info('output', tp.FLOAT, [5, 24])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'flatten.onnx')) - - -def flatten_default_axis(path): - node = helper.make_node( - 'Flatten', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [5, 4, 3, 2])], - [helper.make_tensor_value_info('output', tp.FLOAT, [5, 24])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'flatten_default_axis.onnx')) - - -def flatten_negative_axis(path): - node = helper.make_node( - 'Flatten', - inputs=['input'], - outputs=['output'], - axis=-4, - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [5, 4, 3, 2])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 120])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'flatten_negative_axis.onnx')) - - -## Identity - - -def identity_models(path): - identity(path) - - -def identity(path): - node = helper.make_node( - 'Identity', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [4, 1, 5, 5])], - [helper.make_tensor_value_info('output', tp.FLOAT, [4, 1, 5, 5])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'identity.onnx')) - - -## Max Pool - - -def max_pool_models(path): - max_pool(path) - max_pool_ceil(path) - max_pool_dilations(path) - max_pool_order(path) - - -def max_pool(path): - node = helper.make_node('MaxPool', - inputs=['input'], - outputs=['output'], - kernel_shape=[5, 5], - pads=[2, 2, 2, 2], - strides=[2, 2]) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 1, 5, 5])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 3, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'max_pool.onnx')) - - -def max_pool_autopad(path): - node = helper.make_node('MaxPool', - inputs=['input'], - outputs=['output'], - kernel_shape=[5, 5], - strides=[2, 2], - auto_pad='SAME_UPPER') - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 1, 5, 5])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 3, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'max_pool_autopad.onnx')) - - -def max_pool_ceil(path): - node = helper.make_node('MaxPool', - inputs=['input'], - outputs=['output'], - kernel_shape=[5, 5], - strides=[2, 2], - ceil_mode=True) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 1, 5, 5])], - 
[helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 3, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'max_pool_ceil.onnx')) - - -def max_pool_dilations(path): - node = helper.make_node('MaxPool', - inputs=['input'], - outputs=['output'], - kernel_shape=[5, 5], - strides=[2, 2], - dilations=[2, 2]) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 1, 5, 5])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 3, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'max_pool_dilations.onnx')) - - -def max_pool_order(path): - node = helper.make_node('MaxPool', - inputs=['input'], - outputs=['output'], - kernel_shape=[5, 5], - strides=[2, 2], - storage_order=1) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 1, 5, 5])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 1, 3, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'max_pool_order.onnx')) - - -## Reciprocal - - -def reciprocal_models(path): - reciprocal(path) - - -def reciprocal(path): - node = helper.make_node( - 'Reciprocal', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'reciprocal.onnx')) - - -## Relu - - -def relu_models(path): - relu(path) - - -def relu(path): - node = helper.make_node( - 'Relu', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [1, 3])], - [helper.make_tensor_value_info('output', tp.FLOAT, [1, 3])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'relu.onnx')) - - -## Reshape - - -def reshape_models(path): - reshape(path) - reshape_allow_zero(path) - reshape_reject_zero(path) - - -def reshape(path): - shape = [2, 3, 4] - new_shape = [1, 1, 24] - reshape_dims = [ - 3, - ] - node = helper.make_node('Reshape', - inputs=['input', 'shape'], - outputs=['output']) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input', tp.FLOAT, shape), - helper.make_tensor_value_info('shape', tp.INT64, reshape_dims) - ], [helper.make_tensor_value_info('output', tp.FLOAT, new_shape)]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'reshape.onnx')) - - -def reshape_allow_zero(path): - shape = [0, 3, 4] - new_shape = [3, 4, 0] - reshape_dims = [3, 4, 0] - node = helper.make_node('Reshape', - inputs=['input', 'shape'], - outputs=['output'], - allowzero=1) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input', tp.FLOAT, shape), - helper.make_tensor_value_info('shape', tp.INT64, reshape_dims) - ], [helper.make_tensor_value_info('output', tp.FLOAT, new_shape)]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'reshape_accept_zero.onnx')) - - -def reshape_reject_zero(path): - shape = [0, 3, 4] - 
new_shape = [3, 4, 0] - reshape_dims = [3, 4, 4] - node = helper.make_node('Reshape', - inputs=['input', 'shape'], - outputs=['output'], - allowzero=0) - graph = helper.make_graph([node], 'test_graph', [ - helper.make_tensor_value_info('input', tp.FLOAT, shape), - helper.make_tensor_value_info('shape', tp.INT64, reshape_dims) - ], [helper.make_tensor_value_info('output', tp.FLOAT, new_shape)]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'reshape_reject_zero.onnx')) - - -# Softmax - - -def softmax_models(path): - softmax(path) - softmax_default_axis(path) - softmax_negative_axis(path) - - -def softmax(path): - node = helper.make_node( - 'Softmax', - inputs=['input'], - outputs=['output'], - axis=0, - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [3, 1])], - [helper.make_tensor_value_info('output', tp.FLOAT, [3, 1])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'softmax.onnx')) - - -def softmax_default_axis(path): - node = helper.make_node( - 'Softmax', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [3, 1])], - [helper.make_tensor_value_info('output', tp.FLOAT, [3, 1])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'softmax_default_axis.onnx')) - - -def softmax_negative_axis(path): - node = helper.make_node('Softmax', - inputs=['input'], - outputs=['output'], - axis=-2) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [3, 1])], - [helper.make_tensor_value_info('output', tp.FLOAT, [3, 1])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'softmax_negative_axis.onnx')) - - -## Sqrt - - -def sqrt_models(path): - sqrt(path) - - -def sqrt(path): - node = helper.make_node( - 'Sqrt', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [3, 1])], - [helper.make_tensor_value_info('output', tp.FLOAT, [3, 1])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'sqrt.onnx')) - - -## Tanh - - -def tanh_models(path): - tanh(path) - - -def tanh(path): - node = helper.make_node( - 'Tanh', - inputs=['input'], - outputs=['output'], - ) - graph = helper.make_graph( - [node], 'test_graph', - [helper.make_tensor_value_info('input', tp.FLOAT, [3, 1])], - [helper.make_tensor_value_info('output', tp.FLOAT, [3, 1])]) - model = helper.make_model(graph, producer_name='model') - checker.check_model(model) - save(model, os.path.join(path, 'tanh.onnx')) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--model-directory', - required=True, - help='The directory to store the generated models') - - FLAGS = parser.parse_args() - path = FLAGS.model_directory - - binary_models(path) - avg_pool_models(path) - cast(path) - conv_models(path) - flatten_models(path) - identity_models(path) - max_pool_models(path) - reciprocal_models(path) - reshape_models(path) - relu_models(path) - softmax_models(path) - sqrt_models(path) - tanh_models(path) diff --git a/triton/src/types.h b/triton/src/types.h deleted file mode 
100644 index a034d5f685..0000000000 --- a/triton/src/types.h +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright 2022 NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LEGION_TRITON_TYPES_H__ -#define __LEGION_TRITON_TYPES_H__ - -#include -#include -#include - -namespace triton { namespace backend { namespace legion { - -enum DataType { - DT_HALF, - DT_FLOAT, - DT_DOUBLE, - DT_INT8, - DT_INT16, - DT_INT32, - DT_INT64, - DT_UINT8, - DT_UINT16, - DT_UINT32, - DT_UINT64, - DT_BOOLEAN, - DT_NONE, -}; - -enum FieldName { - FID_DATA, -}; - -static inline size_t -sizeof_datatype(DataType dt) -{ - assert(dt < DT_NONE); - static const size_t sizes[DT_NONE] = { - 2, // hard-code this since it's hard to express sizeof(__half) - sizeof(float), - sizeof(double), - sizeof(int8_t), - sizeof(int16_t), - sizeof(int32_t), - sizeof(int64_t), - sizeof(uint8_t), - sizeof(uint16_t), - sizeof(uint32_t), - sizeof(uint64_t), - sizeof(bool), - }; - return sizes[dt]; -} - -enum ActivationMode { - AC_MODE_NONE, - AC_MODE_RELU, - AC_MODE_SIGMOID, - AC_MODE_TANH, - AC_MODE_GELU, -}; - -enum PoolType { - POOL_MAX, - POOL_AVG, -}; - -enum OperatorType { - OP_INPUT, - OP_WEIGHT, - OP_NOOP, - OP_CONV2D, - OP_DROPOUT, - OP_LINEAR, - OP_BATCHMATMUL, - OP_POOL2D, - OP_SCALAR_ADD, - OP_SCALAR_SUB, - OP_SCALAR_MULTIPLY, - OP_SCALAR_TRUE_DIV, - OP_RELU, - OP_IDENTITY, - OP_SIGMOID, - OP_TANH, - OP_ELU, - OP_FLAT, - OP_SOFTMAX, - OP_BATCHNORM, - OP_CONCAT, - OP_SPLIT, - OP_EMBEDDING, - OP_GROUP_BY, - OP_AGGREGATE, - // OP_ELEMENTWISE, - OP_RESHAPE, - OP_REVERSE, - OP_TRANSPOSE, - OP_EW_ADD, - OP_EW_MUL, - OP_MATMUL, - OP_MUL, - OP_ENLARGE, - OP_MERGE_GCONV, - OP_CONSTANT_IMM, - OP_CONSTANT_ICONV, - OP_CONSTANT_ONE, - OP_CONSTANT_POOL, - OP_SQUEEZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Squeeze - OP_UNSQUEEZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Unsqueeze - OP_EW_SUB, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Sub - OP_EW_DIV, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Div - OP_EW_EQUAL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Equal - OP_EW_GREATER, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Greater - OP_EW_LESS, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Less - OP_EW_MAX, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Max - OP_EW_MIN, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Min - OP_RECIPROCAL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Reciprocal - OP_REDUCE_ARGMAX, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ArgMax - OP_REDUCE_ARGMIN, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ArgMin - OP_REDUCE_MAX, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ReduceMax - OP_REDUCE_MEAN, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ReduceMean - OP_REDUCE_MIN, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ReduceMin - 
OP_REDUCE_PROD, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ReduceProd - OP_REDUCE_SUM, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#ReduceSum - OP_PAD, // https://github.com/dmlc/tvm/blob/master/topi/python/topi/nn/pad.py - OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape - OP_SIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size - OP_TOPK, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK - OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where - OP_CEIL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil - OP_CAST, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Cast - OP_EXP, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Exp - OP_ROUND, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Round - OP_LOG, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Log - OP_LOGICAL_NOT, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Not - OP_SQRT, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Sqrt - OP_LEAKYRELU, - OP_SLICE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Slice - OP_RESIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Resize - OP_PRELU, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#PRelu - OP_GELU, - OP_MULTIHEAD_ATTENTION, - OP_FUSED, // Fused operator type for internal fusion optimizations - // Parallel Ops - OP_REPARTITION, - OP_COMBINE, - OP_REPLICATE, - OP_REDUCTION, - OP_PIPELINE, - OP_FUSED_PARALLEL, -}; - -enum LayerTaskID { - BINARY_TASK_ID, - CONCAT_TASK_ID, - CONV2D_TASK_ID, - MATMUL_TASK_ID, - RESHAPE_TASK_ID, - SOFTMAX_TASK_ID, - UNARY_TASK_ID, -}; - -// forward declarations of some types -class Tensor; -class Weights; -class Operator; -struct InputTensor; -struct OutputTensor; -struct PartitionStrategy; -class LegionModelState; -class LegionModelInstance; -class LegionTritonRuntime; - -}}} // namespace triton::backend::legion - -#endif // __LEGION_TRITON_TYPES_H__
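
Most of the data/*.onnx fixtures loaded by the deleted tests were produced by the scripts/onnx_maker.py generator that this diff also removes. For reference, the pattern it used, condensed into a self-contained sketch for the add.onnx fixture (shapes match the ParseAdd assertions; conv2d_with_bias.onnx was not among the generated files):

    import os

    from onnx import TensorProto as tp
    from onnx import checker, helper, save

    def make_add_fixture(path):
        # Single Add node over two [4, 2] float inputs -> one [4, 2] float output.
        node = helper.make_node('Add', inputs=['input0', 'input1'], outputs=['output'])
        graph = helper.make_graph(
            [node], 'test_graph',
            [helper.make_tensor_value_info('input0', tp.FLOAT, [4, 2]),
             helper.make_tensor_value_info('input1', tp.FLOAT, [4, 2])],
            [helper.make_tensor_value_info('output', tp.FLOAT, [4, 2])])
        model = helper.make_model(graph, producer_name='model')
        checker.check_model(model)
        save(model, os.path.join(path, 'add.onnx'))

Pointing make_add_fixture at the test data directory recreates the fixture with exactly the [4, 2] DT_FLOAT shapes the removed ParseAdd test asserted.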