From 093b29d3ed0e159bf1702de024370ee24b92a755 Mon Sep 17 00:00:00 2001
From: Zhihao Jia <zhihao@cmu.edu>
Date: Tue, 2 Jan 2024 18:50:49 -0500
Subject: [PATCH 01/12] bug fixes and update Legion version

---
 deps/legion                    | 2 +-
 src/ops/linear.cc              | 8 ++------
 src/runtime/model.cu           | 1 -
 src/runtime/request_manager.cc | 7 ++-----
 4 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/deps/legion b/deps/legion
index 626b55689c..d065278678 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c
+Subproject commit d0652786784249e933dd62f675591da99a5e960d
diff --git a/src/ops/linear.cc b/src/ops/linear.cc
index 63b26bfe7d..93e93a5953 100644
--- a/src/ops/linear.cc
+++ b/src/ops/linear.cc
@@ -467,12 +467,8 @@ OpMeta *Linear::init_task_with_dim(Task const *task,
                                        ctx,
                                        runtime,
                                        false /*readOutput*/);
-  TensorAccessorW<WT, NDIM> acc_kernel(regions[2],
-                                       task->regions[2],
-                                       FID_DATA,
-                                       ctx,
-                                       runtime,
-                                       false /*readOutput*/);
+  TensorAccessorR<WT, NDIM> acc_kernel(
+      regions[2], task->regions[2], FID_DATA, ctx, runtime);
 
   // TensorAccessorR<float, 1> acc_bias(
   //     regions[3], task->regions[3], FID_DATA, ctx, runtime);
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index c885b29db2..23b7f0efbe 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -175,7 +175,6 @@ FFHandler
   } else {
     handle.batch_config_metadata = nullptr;
   }
-   
 
   // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize));
 #ifdef FF_USE_NCCL
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 88754f5a82..a285932b7f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1188,10 +1188,7 @@ BeamSearchBatchConfig
       int ssm_decoding_steps =
           profiling_requests[request.guid].ssm_decoding_steps;
 
-      new_bc.beamRequestsInfo[i].beam_size =
-          spec_infer_tree_width.size() > ssm_decoding_steps
-              ? spec_infer_tree_width[ssm_decoding_steps]
-              : 1;
+      new_bc.beamRequestsInfo[i].beam_size = 1;
       // printf("beam size: %d, %d\n",
       //        new_bc.beamRequestsInfo[i].beam_size,
       //        ssm_decoding_steps);
@@ -1820,7 +1817,7 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask,
 void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
                                           int initLength) {
   assert(initLength > 0);
-  std::cout << "append pending bit mask: " << initLength << "\n";
+  // std::cout << "append pending bit mask: " << initLength << "\n";
   // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
   // 0000000..1000
   bitmask.non_tree_cache_size = 0;

From c8d2cd19110cb1e6eba8d5554918e7f9762aba50 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 9 Jan 2024 21:35:00 +0000
Subject: [PATCH 02/12] fix: use find_package(Python) and Python_EXECUTABLE

---
 CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90cab126e6..3fbd06c74e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY)
 
   # python related
   if (FF_USE_PYTHON)
+    find_package(Python COMPONENTS Interpreter Development)
     # create flexflow_cffi_header.py
     add_custom_command(TARGET flexflow
       PRE_BUILD	
@@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY)
       # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
       add_custom_command(TARGET flexflow
         POST_BUILD	
-        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
+        COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
       )
       # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
       add_custom_command(TARGET flexflow
         PRE_BUILD	
-        COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+        COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         COMMENT "Creating flexflow_python interpreter..."
       )
@@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY)
   install(TARGETS flexflow DESTINATION ${LIB_DEST})
   # install python
   if (FF_USE_PYTHON)
-    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+    find_package(Python COMPONENTS Interpreter Development)
+    execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
     if (NOT FF_BUILD_FROM_PYPI)
       install(
         DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/

From 522f1c1ec422d6f433bfbafcdd14b3b39625ac02 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 9 Jan 2024 22:19:18 +0000
Subject: [PATCH 03/12] bug fix: use Python_EXECUTABLE in pip_install cmake

---
 cmake/pip_install/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt
index 7ce38c4abc..105133a310 100644
--- a/cmake/pip_install/CMakeLists.txt
+++ b/cmake/pip_install/CMakeLists.txt
@@ -1,10 +1,10 @@
 # Use setup.py script to re-install the Python bindings library with the right library paths
 if (FF_USE_PYTHON)
-    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)  
+    execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)  
     if(FF_BUILD_FROM_PYPI)
         install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")")
         # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install 
         # Legion_BINARY_DIR=/usr/FlexFlow/build/<something>/deps/legion
-        install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
+        install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)")
     endif()
 endif()

From 0630f6df502e87151340acc7f6a25a1d3a5c1ad0 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Fri, 19 Jan 2024 23:47:46 +0000
Subject: [PATCH 04/12] update legion

---
 deps/legion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/legion b/deps/legion
index 626b55689c..d065278678 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c
+Subproject commit d0652786784249e933dd62f675591da99a5e960d

From 45a1b783b360bc7cc35cfdf2868e33fe5a9ad031 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 20 Jan 2024 01:04:44 +0000
Subject: [PATCH 05/12] fix arithmetic error due to num_devices uninitialized

---
 include/flexflow/request_manager.h |  1 -
 src/runtime/inference_manager.cc   | 30 +-----------------------------
 2 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 50a51705cd..4763eb1ef3 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -55,7 +55,6 @@ class InferenceManager {
 public:
   std::unordered_map<ParallelTensor, std::vector<ParallelTensor>> tensor_buffer;
   std::unordered_map<FFModel *, FileDataLoader *> model_weights_loaders;
-  int num_devices;
 };
 
 struct Request {
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 6588cbceeb..2a94df8b4d 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -28,33 +28,7 @@ using namespace Legion;
 LegionRuntime::Logger::Category log_inf_mgr("InferenceManager");
 LegionRuntime::Logger::Category log_offload("Offloading");
 
-InferenceManager::InferenceManager() {
-#ifdef DEADCODE
-  num_devices = ff_config.workersPerNode * ff_config.numNodes;
-  // Check parallelization degrees
-  assert(ff_config.data_parallelism_degree <= num_devices &&
-         "Data parallelism degree exceeds number of available devices");
-  assert(num_devices % ff_config.data_parallelism_degree == 0 &&
-         "Number of available devices is not divisible by data parallelism "
-         "degree");
-  assert(ff_config.tensor_parallelism_degree <= num_devices &&
-         "Tensor parallelism degree exceeds number of available devices");
-  assert(num_devices % ff_config.tensor_parallelism_degree == 0 &&
-         "Number of available devices is not divisible by tensor parallelism "
-         "degree");
-  assert(ff_config.pipeline_parallelism_degree <= num_devices &&
-         "Pipeline parallelism degree exceeds number of available devices");
-  assert(num_devices % ff_config.pipeline_parallelism_degree == 0 &&
-         "Number of available devices is not divisible by pipeline parallelism "
-         "degree");
-  assert(ff_config.data_parallelism_degree *
-                 ff_config.tensor_parallelism_degree *
-                 ff_config.pipeline_parallelism_degree ==
-             num_devices &&
-         "Product of data, tensor, and pipeline parallelism degrees does not "
-         "match the number of available devices");
-#endif
-}
+InferenceManager::InferenceManager() {}
 
 InferenceManager *inference_manager_singleton = nullptr;
 
@@ -296,8 +270,6 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) {
 void InferenceManager::init_operators_inference(FFModel *model) {
   for (int batch_index = 0; batch_index < model->config.data_parallelism_degree;
        batch_index++) {
-    int expert_device_index = 0;
-    int device_index = batch_index % num_devices;
     for (size_t o = 0; o < model->operators.size(); o++) {
       Op *op = model->operators[o];
       if (op->op_type == OP_WEIGHT) {

From 1646b4388efc264ba7843d8f9b2e9f354f0ba462 Mon Sep 17 00:00:00 2001
From: Zhihao Jia <zhihao@cmu.edu>
Date: Sat, 20 Jan 2024 11:08:30 -0500
Subject: [PATCH 06/12] update legion version

---
 deps/legion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/legion b/deps/legion
index d065278678..ef6c499753 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit d0652786784249e933dd62f675591da99a5e960d
+Subproject commit ef6c499753cebb7bb44ac52c109144a9e6a1c577

From 8e7a7d01f4db35ef6e73b3cf76417fedce6f05ef Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 20 Jan 2024 18:23:47 -0500
Subject: [PATCH 07/12] update ci: use $CONDA_PREFIX instead of /opt/conda

---
 .github/workflows/gpu-ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 3901d6b5f7..9cd6b82087 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -243,7 +243,7 @@ jobs:
 
       - name: Build and Install FlexFlow
         run: |
-          export PATH=/opt/conda/bin:$PATH
+          export PATH=$CONDA_PREFIX/bin:$PATH
           export FF_HOME=$(pwd)
           export FF_BUILD_ALL_EXAMPLES=ON
           export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
@@ -252,18 +252,18 @@ jobs:
 
       - name: Check FlexFlow Python interface (pip)
         run: |
-          export PATH=/opt/conda/bin:$PATH
+          export PATH=$CONDA_PREFIX/bin:$PATH
           export FF_HOME=$(pwd)
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
           ./tests/python_interface_test.sh after-installation
 
       - name: Run multi-gpu tests
         run: |
-          export PATH=/opt/conda/bin:$PATH
+          export PATH=$CONDA_PREFIX/bin:$PATH
           export CUDNN_DIR=/usr/local/cuda
           export CUDA_DIR=/usr/local/cuda
           export FF_HOME=$(pwd)
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
           # C++ tests
           ./tests/cpp_gpu_tests.sh 4
           # Python tests

From f9932aeb6f191c29a2d4ba7dbb056eb059d43adc Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 20 Jan 2024 18:41:35 -0500
Subject: [PATCH 08/12] fix: use cuda-11.8 environment image in gpu-ci

---
 .github/workflows/gpu-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 9cd6b82087..48dcda157e 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -222,7 +222,7 @@ jobs:
       CONDA: "3"
     needs: inference-tests
     container:
-      image: ghcr.io/flexflow/flexflow-environment-cuda:latest
+      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
       options: --gpus all --shm-size=8192m
     steps:
       - name: Install updated git version

From 0b3148ef6adfcb64935e6b1e83a88494910a7b22 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 20 Jan 2024 18:42:08 -0500
Subject: [PATCH 09/12] debugging ci

---
 .github/workflows/gpu-ci.yml | 264 +++++++++++++++++------------------
 1 file changed, 132 insertions(+), 132 deletions(-)

diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 48dcda157e..eb0046617b 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -60,155 +60,155 @@ jobs:
           pip3 install pygithub
           python3 .github/workflows/helpers/gpu_ci_helper.py
 
-  python-interface-check:
-    name: Check Python Interface
-    runs-on: [self-hosted, gpu]
-    defaults:
-      run:
-        shell: bash -l {0} # required to use an activated conda environment
-    env: 
-      CONDA: "3"    
-    needs: gpu-ci-concierge
-    container:
-      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
-      options: --gpus all --shm-size=8192m
-    steps:
-      - name: Install updated git version
-        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+  # python-interface-check:
+  #   name: Check Python Interface
+  #   runs-on: [self-hosted, gpu]
+  #   defaults:
+  #     run:
+  #       shell: bash -l {0} # required to use an activated conda environment
+  #   env: 
+  #     CONDA: "3"    
+  #   needs: gpu-ci-concierge
+  #   container:
+  #     image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+  #     options: --gpus all --shm-size=8192m
+  #   steps:
+  #     - name: Install updated git version
+  #       run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
 
-      - name: Checkout Git Repository
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
+  #     - name: Checkout Git Repository
+  #       uses: actions/checkout@v3
+  #       with:
+  #         submodules: recursive
           
-      - name: Install conda and FlexFlow dependencies
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniconda-version: "latest"
-          activate-environment: flexflow
-          environment-file: conda/flexflow.yml
-          auto-activate-base: false
-          auto-update-conda: false
+  #     - name: Install conda and FlexFlow dependencies
+  #       uses: conda-incubator/setup-miniconda@v2
+  #       with:
+  #         miniconda-version: "latest"
+  #         activate-environment: flexflow
+  #         environment-file: conda/flexflow.yml
+  #         auto-activate-base: false
+  #         auto-update-conda: false
 
-      - name: Install conda and Pytorch dependencies for pytorch alignment test
-        run: |
-          conda env create -f conda/pytorch-gpu.yml
+  #     - name: Install conda and Pytorch dependencies for pytorch alignment test
+  #       run: |
+  #         conda env create -f conda/pytorch-gpu.yml
 
-      - name: Build FlexFlow
-        run: |
-          export PATH=$CONDA_PREFIX/bin:$PATH
-          export FF_HOME=$(pwd)
-          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
-          mkdir build
-          cd build
-          ../config/config.linux
-          make -j
+  #     - name: Build FlexFlow
+  #       run: |
+  #         export PATH=$CONDA_PREFIX/bin:$PATH
+  #         export FF_HOME=$(pwd)
+  #         export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+  #         mkdir build
+  #         cd build
+  #         ../config/config.linux
+  #         make -j
 
-      - name: Check FlexFlow Python interface (before installation)
-        run: |
-          export PATH=$CONDA_PREFIX/bin:$PATH
-          export FF_HOME=$(pwd)
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
-          ./tests/python_interface_test.sh before-installation
+  #     - name: Check FlexFlow Python interface (before installation)
+  #       run: |
+  #         export PATH=$CONDA_PREFIX/bin:$PATH
+  #         export FF_HOME=$(pwd)
+  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+  #         ./tests/python_interface_test.sh before-installation
 
-      - name: Install FlexFlow
-        run: |
-          export PATH=$CONDA_PREFIX/bin:$PATH
-          export FF_HOME=$(pwd)
-          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
-          cd build
-          ../config/config.linux
-          make install
-          ldconfig
+  #     - name: Install FlexFlow
+  #       run: |
+  #         export PATH=$CONDA_PREFIX/bin:$PATH
+  #         export FF_HOME=$(pwd)
+  #         export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+  #         cd build
+  #         ../config/config.linux
+  #         make install
+  #         ldconfig
 
-      - name: Check FlexFlow Python interface (after installation)
-        run: |
-          export PATH=$CONDA_PREFIX/bin:$PATH
-          export FF_HOME=$(pwd)
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
-          ./tests/python_interface_test.sh after-installation
+  #     - name: Check FlexFlow Python interface (after installation)
+  #       run: |
+  #         export PATH=$CONDA_PREFIX/bin:$PATH
+  #         export FF_HOME=$(pwd)
+  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+  #         ./tests/python_interface_test.sh after-installation
 
-      - name: Run flexflow alignment with pytorch
-        run: |
-          # run alingment tests
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
-          ./tests/align/test_all_operators.sh
+  #     - name: Run flexflow alignment with pytorch
+  #       run: |
+  #         # run alingment tests
+  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+  #         ./tests/align/test_all_operators.sh
 
-  inference-tests:
-    name: Inference Tests
-    runs-on: [self-hosted, gpu]
-    defaults:
-      run:
-        shell: bash -l {0} # required to use an activated conda environment
-    env: 
-      CONDA: "3"
-      HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
-    needs: gpu-ci-concierge
-    container:
-      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
-      options: --gpus all --shm-size=8192m
-    steps:
-      - name: Install updated git version
-        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+  # inference-tests:
+  #   name: Inference Tests
+  #   runs-on: [self-hosted, gpu]
+  #   defaults:
+  #     run:
+  #       shell: bash -l {0} # required to use an activated conda environment
+  #   env: 
+  #     CONDA: "3"
+  #     HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
+  #   needs: gpu-ci-concierge
+  #   container:
+  #     image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+  #     options: --gpus all --shm-size=8192m
+  #   steps:
+  #     - name: Install updated git version
+  #       run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
 
-      - name: Checkout Git Repository
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
+  #     - name: Checkout Git Repository
+  #       uses: actions/checkout@v3
+  #       with:
+  #         submodules: recursive
           
-      - name: Install conda and FlexFlow dependencies
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniconda-version: "latest"
-          activate-environment: flexflow
-          environment-file: conda/flexflow.yml
-          auto-activate-base: false
+  #     - name: Install conda and FlexFlow dependencies
+  #       uses: conda-incubator/setup-miniconda@v2
+  #       with:
+  #         miniconda-version: "latest"
+  #         activate-environment: flexflow
+  #         environment-file: conda/flexflow.yml
+  #         auto-activate-base: false
 
-      - name: Build FlexFlow
-        run: |
-          export PATH=$CONDA_PREFIX/bin:$PATH
-          export FF_HOME=$(pwd)
-          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
-          export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
-          mkdir build
-          cd build
-          ../config/config.linux
-          make -j
+  #     - name: Build FlexFlow
+  #       run: |
+  #         export PATH=$CONDA_PREFIX/bin:$PATH
+  #         export FF_HOME=$(pwd)
+  #         export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+  #         export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+  #         mkdir build
+  #         cd build
+  #         ../config/config.linux
+  #         make -j
 
-      - name: Run inference tests
-        env:
-          CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
-        run: |
-          export PATH=$CONDA_PREFIX/bin:$PATH
-          export FF_HOME=$(pwd)
-          export CUDNN_DIR=/usr/local/cuda
-          export CUDA_DIR=/usr/local/cuda
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+  #     - name: Run inference tests
+  #       env:
+  #         CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
+  #       run: |
+  #         export PATH=$CONDA_PREFIX/bin:$PATH
+  #         export FF_HOME=$(pwd)
+  #         export CUDNN_DIR=/usr/local/cuda
+  #         export CUDA_DIR=/usr/local/cuda
+  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
           
-          # GPT tokenizer test
-          # ./tests/gpt_tokenizer_test.sh
+  #         # GPT tokenizer test
+  #         # ./tests/gpt_tokenizer_test.sh
 
-          # Inference tests
-          source ./build/set_python_envs.sh
-          ./tests/inference_tests.sh
+  #         # Inference tests
+  #         source ./build/set_python_envs.sh
+  #         ./tests/inference_tests.sh
       
-      - name: Save inference output as an artifact
-        if: always()
-        run: | 
-          cd inference
-          tar -zcvf output.tar.gz ./output
+  #     - name: Save inference output as an artifact
+  #       if: always()
+  #       run: | 
+  #         cd inference
+  #         tar -zcvf output.tar.gz ./output
 
-      - name: Upload artifact
-        uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: output
-          path: inference/output.tar.gz
+  #     - name: Upload artifact
+  #       uses: actions/upload-artifact@v3
+  #       if: always()
+  #       with:
+  #         name: output
+  #         path: inference/output.tar.gz
       
-      # Github persists the .cache folder across different runs/containers
-      - name: Clear cache
-        if: always()
-        run: sudo rm -rf ~/.cache 
+  #     # Github persists the .cache folder across different runs/containers
+  #     - name: Clear cache
+  #       if: always()
+  #       run: sudo rm -rf ~/.cache 
 
   training-tests:
     name: Training Tests
@@ -220,7 +220,7 @@ jobs:
         shell: bash -l {0} # required to use an activated conda environment
     env: 
       CONDA: "3"
-    needs: inference-tests
+    # needs: inference-tests
     container:
       image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
       options: --gpus all --shm-size=8192m

From d1c541f24a3466afdf6510e12e8399343cf47baa Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 20 Jan 2024 18:53:35 -0500
Subject: [PATCH 10/12] Revert "debugging ci"

This reverts commit 0b3148ef6adfcb64935e6b1e83a88494910a7b22.
---
 .github/workflows/gpu-ci.yml | 264 +++++++++++++++++------------------
 1 file changed, 132 insertions(+), 132 deletions(-)

diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index eb0046617b..48dcda157e 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -60,155 +60,155 @@ jobs:
           pip3 install pygithub
           python3 .github/workflows/helpers/gpu_ci_helper.py
 
-  # python-interface-check:
-  #   name: Check Python Interface
-  #   runs-on: [self-hosted, gpu]
-  #   defaults:
-  #     run:
-  #       shell: bash -l {0} # required to use an activated conda environment
-  #   env: 
-  #     CONDA: "3"    
-  #   needs: gpu-ci-concierge
-  #   container:
-  #     image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
-  #     options: --gpus all --shm-size=8192m
-  #   steps:
-  #     - name: Install updated git version
-  #       run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+  python-interface-check:
+    name: Check Python Interface
+    runs-on: [self-hosted, gpu]
+    defaults:
+      run:
+        shell: bash -l {0} # required to use an activated conda environment
+    env: 
+      CONDA: "3"    
+    needs: gpu-ci-concierge
+    container:
+      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+      options: --gpus all --shm-size=8192m
+    steps:
+      - name: Install updated git version
+        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
 
-  #     - name: Checkout Git Repository
-  #       uses: actions/checkout@v3
-  #       with:
-  #         submodules: recursive
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
           
-  #     - name: Install conda and FlexFlow dependencies
-  #       uses: conda-incubator/setup-miniconda@v2
-  #       with:
-  #         miniconda-version: "latest"
-  #         activate-environment: flexflow
-  #         environment-file: conda/flexflow.yml
-  #         auto-activate-base: false
-  #         auto-update-conda: false
+      - name: Install conda and FlexFlow dependencies
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          miniconda-version: "latest"
+          activate-environment: flexflow
+          environment-file: conda/flexflow.yml
+          auto-activate-base: false
+          auto-update-conda: false
 
-  #     - name: Install conda and Pytorch dependencies for pytorch alignment test
-  #       run: |
-  #         conda env create -f conda/pytorch-gpu.yml
+      - name: Install conda and Pytorch dependencies for pytorch alignment test
+        run: |
+          conda env create -f conda/pytorch-gpu.yml
 
-  #     - name: Build FlexFlow
-  #       run: |
-  #         export PATH=$CONDA_PREFIX/bin:$PATH
-  #         export FF_HOME=$(pwd)
-  #         export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
-  #         mkdir build
-  #         cd build
-  #         ../config/config.linux
-  #         make -j
+      - name: Build FlexFlow
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export FF_HOME=$(pwd)
+          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+          mkdir build
+          cd build
+          ../config/config.linux
+          make -j
 
-  #     - name: Check FlexFlow Python interface (before installation)
-  #       run: |
-  #         export PATH=$CONDA_PREFIX/bin:$PATH
-  #         export FF_HOME=$(pwd)
-  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
-  #         ./tests/python_interface_test.sh before-installation
+      - name: Check FlexFlow Python interface (before installation)
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export FF_HOME=$(pwd)
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+          ./tests/python_interface_test.sh before-installation
 
-  #     - name: Install FlexFlow
-  #       run: |
-  #         export PATH=$CONDA_PREFIX/bin:$PATH
-  #         export FF_HOME=$(pwd)
-  #         export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
-  #         cd build
-  #         ../config/config.linux
-  #         make install
-  #         ldconfig
+      - name: Install FlexFlow
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export FF_HOME=$(pwd)
+          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+          cd build
+          ../config/config.linux
+          make install
+          ldconfig
 
-  #     - name: Check FlexFlow Python interface (after installation)
-  #       run: |
-  #         export PATH=$CONDA_PREFIX/bin:$PATH
-  #         export FF_HOME=$(pwd)
-  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
-  #         ./tests/python_interface_test.sh after-installation
+      - name: Check FlexFlow Python interface (after installation)
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export FF_HOME=$(pwd)
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+          ./tests/python_interface_test.sh after-installation
 
-  #     - name: Run flexflow alignment with pytorch
-  #       run: |
-  #         # run alingment tests
-  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
-  #         ./tests/align/test_all_operators.sh
+      - name: Run flexflow alignment with pytorch
+        run: |
+          # run alingment tests
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+          ./tests/align/test_all_operators.sh
 
-  # inference-tests:
-  #   name: Inference Tests
-  #   runs-on: [self-hosted, gpu]
-  #   defaults:
-  #     run:
-  #       shell: bash -l {0} # required to use an activated conda environment
-  #   env: 
-  #     CONDA: "3"
-  #     HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
-  #   needs: gpu-ci-concierge
-  #   container:
-  #     image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
-  #     options: --gpus all --shm-size=8192m
-  #   steps:
-  #     - name: Install updated git version
-  #       run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+  inference-tests:
+    name: Inference Tests
+    runs-on: [self-hosted, gpu]
+    defaults:
+      run:
+        shell: bash -l {0} # required to use an activated conda environment
+    env: 
+      CONDA: "3"
+      HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
+    needs: gpu-ci-concierge
+    container:
+      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+      options: --gpus all --shm-size=8192m
+    steps:
+      - name: Install updated git version
+        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
 
-  #     - name: Checkout Git Repository
-  #       uses: actions/checkout@v3
-  #       with:
-  #         submodules: recursive
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
           
-  #     - name: Install conda and FlexFlow dependencies
-  #       uses: conda-incubator/setup-miniconda@v2
-  #       with:
-  #         miniconda-version: "latest"
-  #         activate-environment: flexflow
-  #         environment-file: conda/flexflow.yml
-  #         auto-activate-base: false
+      - name: Install conda and FlexFlow dependencies
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          miniconda-version: "latest"
+          activate-environment: flexflow
+          environment-file: conda/flexflow.yml
+          auto-activate-base: false
 
-  #     - name: Build FlexFlow
-  #       run: |
-  #         export PATH=$CONDA_PREFIX/bin:$PATH
-  #         export FF_HOME=$(pwd)
-  #         export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
-  #         export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
-  #         mkdir build
-  #         cd build
-  #         ../config/config.linux
-  #         make -j
+      - name: Build FlexFlow
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export FF_HOME=$(pwd)
+          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+          export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+          mkdir build
+          cd build
+          ../config/config.linux
+          make -j
 
-  #     - name: Run inference tests
-  #       env:
-  #         CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
-  #       run: |
-  #         export PATH=$CONDA_PREFIX/bin:$PATH
-  #         export FF_HOME=$(pwd)
-  #         export CUDNN_DIR=/usr/local/cuda
-  #         export CUDA_DIR=/usr/local/cuda
-  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+      - name: Run inference tests
+        env:
+          CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export FF_HOME=$(pwd)
+          export CUDNN_DIR=/usr/local/cuda
+          export CUDA_DIR=/usr/local/cuda
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
           
-  #         # GPT tokenizer test
-  #         # ./tests/gpt_tokenizer_test.sh
+          # GPT tokenizer test
+          # ./tests/gpt_tokenizer_test.sh
 
-  #         # Inference tests
-  #         source ./build/set_python_envs.sh
-  #         ./tests/inference_tests.sh
+          # Inference tests
+          source ./build/set_python_envs.sh
+          ./tests/inference_tests.sh
       
-  #     - name: Save inference output as an artifact
-  #       if: always()
-  #       run: | 
-  #         cd inference
-  #         tar -zcvf output.tar.gz ./output
+      - name: Save inference output as an artifact
+        if: always()
+        run: | 
+          cd inference
+          tar -zcvf output.tar.gz ./output
 
-  #     - name: Upload artifact
-  #       uses: actions/upload-artifact@v3
-  #       if: always()
-  #       with:
-  #         name: output
-  #         path: inference/output.tar.gz
+      - name: Upload artifact
+        uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: output
+          path: inference/output.tar.gz
       
-  #     # Github persists the .cache folder across different runs/containers
-  #     - name: Clear cache
-  #       if: always()
-  #       run: sudo rm -rf ~/.cache 
+      # Github persists the .cache folder across different runs/containers
+      - name: Clear cache
+        if: always()
+        run: sudo rm -rf ~/.cache 
 
   training-tests:
     name: Training Tests
@@ -220,7 +220,7 @@ jobs:
         shell: bash -l {0} # required to use an activated conda environment
     env: 
       CONDA: "3"
-    # needs: inference-tests
+    needs: inference-tests
     container:
       image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
       options: --gpus all --shm-size=8192m

From 96bc34bc1f4e42839e0ba7afd6eebc31ffbc169a Mon Sep 17 00:00:00 2001
From: Zhihao Jia <zhihao@cmu.edu>
Date: Thu, 25 Jan 2024 18:21:42 -0500
Subject: [PATCH 11/12] update mapper interface

---
 deps/legion               |  2 +-
 include/flexflow/mapper.h |  9 ++++----
 src/mapper/mapper.cc      | 47 +++++++++++++++++----------------------
 3 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/deps/legion b/deps/legion
index ef6c499753..24e8c45234 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit ef6c499753cebb7bb44ac52c109144a9e6a1c577
+Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835
diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h
index 71be1892aa..e8337818ec 100644
--- a/include/flexflow/mapper.h
+++ b/include/flexflow/mapper.h
@@ -83,11 +83,10 @@ class FFMapper : public NullMapper {
                         Task const &task,
                         MapTaskInput const &input,
                         MapTaskOutput &output);
-  virtual void map_replicate_task(const MapperContext ctx,
-                                  Task const &task,
-                                  MapTaskInput const &input,
-                                  MapTaskOutput const &default_output,
-                                  MapReplicateTaskOutput &output);
+  virtual void replicate_task(const MapperContext ctx,
+                              Task const &task,
+                              ReplicateTaskInput const &input,
+                              ReplicateTaskOutput &output);
   virtual void select_task_variant(const MapperContext ctx,
                                    Task const &task,
                                    SelectVariantInput const &input,
diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index bc26a79d3e..d46bfc2877 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -661,44 +661,37 @@ void FFMapper::map_task(const MapperContext ctx,
   } // for idx
 }
 
-void FFMapper::map_replicate_task(const MapperContext ctx,
-                                  Task const &task,
-                                  MapTaskInput const &input,
-                                  MapTaskOutput const &default_output,
-                                  MapReplicateTaskOutput &output) {
+void FFMapper::replicate_task(const MapperContext ctx,
+                              Task const &task,
+                              ReplicateTaskInput const &input,
+                              ReplicateTaskOutput &output) {
   // Should only be replicated for the top-level task
   assert((task.get_depth() == 0) && (task.regions.size() == 0));
   const Processor::Kind target_kind = task.target_proc.kind();
-  VariantID chosen_variant;
   {
     std::vector<VariantID> variant_ids;
-    runtime->find_valid_variants(
-        ctx, task.task_id, variant_ids, task.target_proc.kind());
+    runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind);
     // Currently assume there is exactly one variant
     assert(variant_ids.size() == 1);
-    chosen_variant = variant_ids[0];
+    output.chosen_variant = variant_ids[0];
   }
-  std::vector<Processor> const &all_procs = all_procs_by_kind(target_kind);
-  // Place on replicate on each node by default
-  output.task_mappings.resize(total_nodes, default_output);
-  // Assume default_output does not include any target_procs
-  assert(default_output.target_procs.size() == 0);
-  for (std::vector<Processor>::const_iterator it = all_procs.begin();
-       it != all_procs.end();
+  output.target_processors.resize(total_nodes);
+  std::vector<bool> handled(total_nodes, false); // nodes that already have one
+  size_t count = 0; // number of nodes assigned a target processor
+  Machine::ProcessorQuery procs(machine);
+  procs.only_kind(target_kind);
+  for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end();
        it++) {
-    AddressSpace space = it->address_space();
-    assert(space < output.task_mappings.size());
-    // Add *it as a target_proc if we haven't found one
-    if (output.task_mappings[space].target_procs.size() == 0) {
-      output.task_mappings[space].target_procs.push_back(*it);
+    const AddressSpace space = it->address_space();
+    assert(space < handled.size()); // guards handled[] / target_processors[]
+    if (handled[space]) {
+      continue;
     }
+    output.target_processors[space] = *it;
+    handled[space] = true;
+    count++;
   }
-  output.control_replication_map.resize(total_nodes);
-  for (int idx = 0; idx < total_nodes; idx++) {
-    output.task_mappings[idx].chosen_variant = chosen_variant;
-    output.control_replication_map[idx] =
-        output.task_mappings[idx].target_procs[0];
-  }
+  assert(count == total_nodes); // every node must get exactly one processor
 }
 
 void FFMapper::select_task_variant(const MapperContext ctx,

From 573380d1ef502afdd53a01bffe38b21c2d39cd6f Mon Sep 17 00:00:00 2001
From: Zhihao Jia <zhihao@cmu.edu>
Date: Thu, 25 Jan 2024 22:51:55 -0500
Subject: [PATCH 12/12] add ncclFinalize

---
 include/flexflow/model.h    |  2 ++
 include/flexflow/operator.h |  5 +++
 src/runtime/model.cc        | 61 +++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index dd6dc76b4d..95be9ab581 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -202,6 +202,7 @@ enum TaskIDs {
   // NCCL tasks
   NCCL_GETUNIQUEID_TASK_ID,
   NCCL_INIT_COMMS_TASK_ID,
+  NCCL_FINISH_COMMS_TASK_ID,
   // Search
   STRATEGY_SEARCH_TASK_ID,
   // Graph
@@ -397,6 +398,7 @@ std::vector<ParallelTensorShape>
 class FFModel {
 public:
   FFModel(FFConfig &config, bool cpu_offload = false);
+  ~FFModel();
 
   static constexpr float PROPAGATION_CHANCE = 0.25;
   static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75;
diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index 73c2c3e092..1b19bdb82f 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -406,6 +406,11 @@ class Op {
                            std::vector<Legion::PhysicalRegion> const &regions,
                            Legion::Context ctx,
                            Legion::Runtime *runtime);
+  static void
+      finish_nccl_comms_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+                             Legion::Context ctx,
+                             Legion::Runtime *runtime);
 #endif
 protected:
   void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap);
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index c07c33efca..f9763627c8 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -606,6 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task,
   //     ncclComm, allRanks, myRank, ncclId);
   return ncclComm;
 }
+
+void Op::finish_nccl_comms_task(Task const *task,
+                                std::vector<PhysicalRegion> const &regions,
+                                Context ctx,
+                                Runtime *runtime) {
+  ncclComm_t comm = *((ncclComm_t *)task->local_args); // this point's comm
+  checkNCCL(ncclCommFinalize(comm)); // finalize first, destroy afterwards
+  checkNCCL(ncclCommDestroy(comm));
+}
 #endif
 
 /**
@@ -1578,6 +1587,43 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload)
   model_id = model_counter++;
 }
 
+FFModel::~FFModel() {
+  // Finalize and destroy every NCCL group in view_hash_to_nccl_comms
+#ifdef FF_USE_NCCL
+  Context ctx = config.lg_ctx;
+  Runtime *runtime = config.lg_hlr;
+  for (auto const &comm : view_hash_to_nccl_comms) {
+    // Find an operator output whose machine view hashes to this group's key
+    MachineView view;
+    for (size_t l = 0; l < operators.size(); l++) {
+      view = operators[l]->outputs[0]->machine_view;
+      if (view.hash() == comm.first) {
+        break;
+      }
+    }
+    assert(view.hash() == comm.first && "Cannot find the machine view");
+    IndexSpace task_is = get_or_create_task_is(view);
+    Domain domain = runtime->get_index_space_domain(ctx, task_is);
+    ArgumentMap argmap;
+    int idx = 0; // index into comm.second, one communicator per point
+    for (Domain::DomainPointIterator it(domain); it; it++, idx++) {
+      argmap.set_point(*it,
+                       TaskArgument(&comm.second[idx], sizeof(ncclComm_t)));
+    }
+    IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID,
+                                 task_is,
+                                 TaskArgument(nullptr, 0),
+                                 argmap,
+                                 Predicate::TRUE_PRED,
+                                 false /*must*/,
+                                 0 /*mapper_id*/,
+                                 comm.first);
+    FutureMap fm = runtime->execute_index_space(ctx, index_launcher);
+    fm.wait_all_results(); // block until this group's tasks complete
+  }
+#endif
+}
+
 void FFModel::clear_graph_search_cache() {
   this->graph_search->clear_cache();
   this->search->clear_cache();
@@ -6853,6 +6899,21 @@ void register_flexflow_internal_tasks(Runtime *runtime,
           registrar);
     }
   }
+  {
+    TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID,
+                                   "NCCL Finish Communicators");
+    registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
+    registrar.set_leaf();
+    if (pre_register) {
+      Runtime::preregister_task_variant<Op::finish_nccl_comms_task>(
+          registrar, "NCCL Finish Communicators Task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<Op::finish_nccl_comms_task>(registrar);
+    }
+  }
 #endif
   // Search
   {